howard.objects.variants

   1import csv
   2import gc
   3import gzip
   4import io
   5import multiprocessing
   6import os
   7import random
   8import re
   9import shlex
  10import sqlite3
  11import subprocess
  12from tempfile import NamedTemporaryFile, TemporaryDirectory
  13import tempfile
  14import duckdb
  15import json
  16import yaml
  17import argparse
  18import Bio.bgzf as bgzf
  19import pandas as pd
  20from pyfaidx import Fasta
  21import numpy as np
  22import vcf
  23import logging as log
  24import fastparquet as fp
  25from multiprocesspandas import applyparallel
  26
  27from howard.functions.commons import *
  28from howard.objects.database import *
  29from howard.functions.databases import *
  30from howard.functions.utils import *
  31
  32
  33class Variants:
  34
  35    def __init__(
  36        self,
  37        conn=None,
  38        input: str = None,
  39        output: str = None,
  40        config: dict = {},
  41        param: dict = {},
  42        load: bool = False,
  43    ) -> None:
  44        """
  45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  46        header
  47
  48        :param conn: the connection to the database
  49        :param input: the input file
  50        :param output: the output file
  51        :param config: a dictionary containing the configuration of the model
  52        :param param: a dictionary containing the parameters of the model
  53        """
  54
  55        # Init variables
  56        self.init_variables()
  57
  58        # Input
  59        self.set_input(input)
  60
  61        # Config
  62        self.set_config(config)
  63
  64        # Param
  65        self.set_param(param)
  66
  67        # Output
  68        self.set_output(output)
  69
  70        # connexion
  71        self.set_connexion(conn)
  72
  73        # Header
  74        self.set_header()
  75
  76        # Load data
  77        if load:
  78            self.load_data()
  79
  80    def set_input(self, input: str = None) -> None:
  81        """
  82        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  83        attributes in the class accordingly.
  84
  85        :param input: The `set_input` method in the provided code snippet is used to set attributes
  86        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  87        :type input: str
  88        """
  89
  90        if input and not isinstance(input, str):
  91            try:
  92                self.input = input.name
  93            except:
  94                log.error(f"Input file '{input} in bad format")
  95                raise ValueError(f"Input file '{input} in bad format")
  96        else:
  97            self.input = input
  98
  99        # Input format
 100        if input:
 101            input_name, input_extension = os.path.splitext(self.input)
 102            self.input_name = input_name
 103            self.input_extension = input_extension
 104            self.input_format = self.input_extension.replace(".", "")
 105
 106    def set_config(self, config: dict) -> None:
 107        """
 108        The set_config function takes a config object and assigns it as the configuration object for the
 109        class.
 110
 111        :param config: The `config` parameter in the `set_config` function is a dictionary object that
 112        contains configuration settings for the class. When you call the `set_config` function with a
 113        dictionary object as the argument, it will set that dictionary as the configuration object for
 114        the class
 115        :type config: dict
 116        """
 117
 118        self.config = config
 119
 120    def set_param(self, param: dict) -> None:
 121        """
 122        This function sets a parameter object for the class based on the input dictionary.
 123
 124        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
 125        as the `param` attribute of the class instance
 126        :type param: dict
 127        """
 128
 129        self.param = param
 130
 131    def init_variables(self) -> None:
 132        """
 133        This function initializes the variables that will be used in the rest of the class
 134        """
 135
 136        self.prefix = "howard"
 137        self.table_variants = "variants"
 138        self.dataframe = None
 139
 140        self.comparison_map = {
 141            "gt": ">",
 142            "gte": ">=",
 143            "lt": "<",
 144            "lte": "<=",
 145            "equals": "=",
 146            "contains": "SIMILAR TO",
 147        }
 148
 149        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 150
 151        self.code_type_map_to_sql = {
 152            "Integer": "INTEGER",
 153            "String": "VARCHAR",
 154            "Float": "FLOAT",
 155            "Flag": "VARCHAR",
 156        }
 157
 158        self.index_additionnal_fields = []
 159
 160    def get_indexing(self) -> bool:
 161        """
 162        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 163        returns False.
 164        :return: The value of the indexing parameter.
 165        """
 166
 167        return self.get_param().get("indexing", False)
 168
 169    def get_connexion_config(self) -> dict:
 170        """
 171        The function `get_connexion_config` returns a dictionary containing the configuration for a
 172        connection, including the number of threads and memory limit.
 173        :return: a dictionary containing the configuration for the Connexion library.
 174        """
 175
 176        # config
 177        config = self.get_config()
 178
 179        # Connexion config
 180        connexion_config = {}
 181        threads = self.get_threads()
 182
 183        # Threads
 184        if threads:
 185            connexion_config["threads"] = threads
 186
 187        # Memory
 188        # if config.get("memory", None):
 189        #     connexion_config["memory_limit"] = config.get("memory")
 190        if self.get_memory():
 191            connexion_config["memory_limit"] = self.get_memory()
 192
 193        # Temporary directory
 194        if config.get("tmp", None):
 195            connexion_config["temp_directory"] = config.get("tmp")
 196
 197        # Access
 198        if config.get("access", None):
 199            access = config.get("access")
 200            if access in ["RO"]:
 201                access = "READ_ONLY"
 202            elif access in ["RW"]:
 203                access = "READ_WRITE"
 204            connexion_db = self.get_connexion_db()
 205            if connexion_db in ":memory:":
 206                access = "READ_WRITE"
 207            connexion_config["access_mode"] = access
 208
 209        return connexion_config
 210
 211    def get_duckdb_settings(self) -> dict:
 212        """
 213        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 214        string.
 215        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 216        """
 217
 218        # config
 219        config = self.get_config()
 220
 221        # duckdb settings
 222        duckdb_settings_dict = {}
 223        if config.get("duckdb_settings", None):
 224            duckdb_settings = config.get("duckdb_settings")
 225            duckdb_settings = full_path(duckdb_settings)
 226            # duckdb setting is a file
 227            if os.path.exists(duckdb_settings):
 228                with open(duckdb_settings) as json_file:
 229                    duckdb_settings_dict = yaml.safe_load(json_file)
 230            # duckdb settings is a string
 231            else:
 232                duckdb_settings_dict = json.loads(duckdb_settings)
 233
 234        return duckdb_settings_dict
 235
 236    def set_connexion_db(self) -> str:
 237        """
 238        The function `set_connexion_db` returns the appropriate database connection string based on the
 239        input format and connection type.
 240        :return: the value of the variable `connexion_db`.
 241        """
 242
 243        # Default connexion db
 244        default_connexion_db = ":memory:"
 245
 246        # Find connexion db
 247        if self.get_input_format() in ["db", "duckdb"]:
 248            connexion_db = self.get_input()
 249        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 250            connexion_db = default_connexion_db
 251        elif self.get_connexion_type() in ["tmpfile"]:
 252            tmp_name = tempfile.mkdtemp(
 253                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 254            )
 255            connexion_db = f"{tmp_name}/tmp.db"
 256        elif self.get_connexion_type() != "":
 257            connexion_db = self.get_connexion_type()
 258        else:
 259            connexion_db = default_connexion_db
 260
 261        # Set connexion db
 262        self.connexion_db = connexion_db
 263
 264        return connexion_db
 265
 266    def set_connexion(self, conn) -> None:
 267        """
 268        The function `set_connexion` creates a connection to a database, with options for different
 269        database formats and settings.
 270
 271        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
 272        database. If a connection is not provided, a new connection to an in-memory database is created.
 273        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
 274        sqlite
 275        """
 276
 277        # Connexion db
 278        connexion_db = self.set_connexion_db()
 279
 280        # Connexion config
 281        connexion_config = self.get_connexion_config()
 282
 283        # Connexion format
 284        connexion_format = self.get_config().get("connexion_format", "duckdb")
 285        # Set connexion format
 286        self.connexion_format = connexion_format
 287
 288        # Connexion
 289        if not conn:
 290            if connexion_format in ["duckdb"]:
 291                conn = duckdb.connect(connexion_db, config=connexion_config)
 292                # duckDB settings
 293                duckdb_settings = self.get_duckdb_settings()
 294                if duckdb_settings:
 295                    for setting in duckdb_settings:
 296                        setting_value = duckdb_settings.get(setting)
 297                        if isinstance(setting_value, str):
 298                            setting_value = f"'{setting_value}'"
 299                        conn.execute(f"PRAGMA {setting}={setting_value};")
 300            elif connexion_format in ["sqlite"]:
 301                conn = sqlite3.connect(connexion_db)
 302
 303        # Set connexion
 304        self.conn = conn
 305
 306        # Log
 307        log.debug(f"connexion_format: {connexion_format}")
 308        log.debug(f"connexion_db: {connexion_db}")
 309        log.debug(f"connexion config: {connexion_config}")
 310        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 311
 312    def set_output(self, output: str = None) -> None:
 313        """
 314        The `set_output` function in Python sets the output file based on the input or a specified key
 315        in the config file, extracting the output name, extension, and format.
 316
 317        :param output: The `output` parameter in the `set_output` method is used to specify the name of
 318        the output file. If the config file has an 'output' key, the method sets the output to the value
 319        of that key. If no output is provided, it sets the output to `None`
 320        :type output: str
 321        """
 322
 323        if output and not isinstance(output, str):
 324            self.output = output.name
 325        else:
 326            self.output = output
 327
 328        # Output format
 329        if self.output:
 330            output_name, output_extension = os.path.splitext(self.output)
 331            self.output_name = output_name
 332            self.output_extension = output_extension
 333            self.output_format = self.output_extension.replace(".", "")
 334        else:
 335            self.output_name = None
 336            self.output_extension = None
 337            self.output_format = None
 338
    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it in two forms:
        `self.header_list` (list of header lines) and `self.header_vcf`
        (a `vcf.Reader` built from those lines).

        Header lookup order:
          1. `header_file` given in the config
          2. the input file itself (vcf/hdr formats, compressed or not)
          3. a sibling `<input>.hdr` file
          4. reconstruction from the file columns (via `Database`)
          5. a minimal default VCF header as a last resort

        Both attributes are set to None when there is no input file.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header, used when no other header can be found
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Resolve to a full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header explicitly provided in config
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Header embedded in the input file itself (vcf/hdr formats)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # Compressed vcf file (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # Uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in a default external file <input>.hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to reconstruct a header from the file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create a database object on the input file
                            db_for_header = Database(database=input_file)

                            # Infer header INFO fields from the file columns
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Actual columns present in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write the inferred header to a temporary file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Re-read the header and replace the #CHROM line
                            # with the real columns of the file
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare `except` also swallows SystemExit /
                    # KeyboardInterrupt — consider `except Exception`
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # Unknown/unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Fall back to the default header if nothing was read
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
 440
 441    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
 442        """
 443        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
 444        DataFrame based on the connection format.
 445
 446        :param query: The `query` parameter in the `get_query_to_df` function is a string that
 447        represents the SQL query you want to execute. This query will be used to fetch data from a
 448        database and convert it into a pandas DataFrame
 449        :type query: str
 450        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
 451        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
 452        function will only fetch up to that number of rows from the database query result. If no limit
 453        is specified,
 454        :type limit: int
 455        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
 456        """
 457
 458        # Connexion format
 459        connexion_format = self.get_connexion_format()
 460
 461        # Limit in query
 462        if limit:
 463            pd.set_option("display.max_rows", limit)
 464            if connexion_format in ["duckdb"]:
 465                df = (
 466                    self.conn.execute(query)
 467                    .fetch_record_batch(limit)
 468                    .read_next_batch()
 469                    .to_pandas()
 470                )
 471            elif connexion_format in ["sqlite"]:
 472                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
 473
 474        # Full query
 475        else:
 476            if connexion_format in ["duckdb"]:
 477                df = self.conn.execute(query).df()
 478            elif connexion_format in ["sqlite"]:
 479                df = pd.read_sql_query(query, self.conn)
 480
 481        return df
 482
 483    def get_overview(self) -> None:
 484        """
 485        The function prints the input, output, config, and dataframe of the current object
 486        """
 487        table_variants_from = self.get_table_variants(clause="from")
 488        sql_columns = self.get_header_columns_as_sql()
 489        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 490        df = self.get_query_to_df(sql_query_export)
 491        log.info(
 492            "Input:  "
 493            + str(self.get_input())
 494            + " ["
 495            + str(str(self.get_input_format()))
 496            + "]"
 497        )
 498        log.info(
 499            "Output: "
 500            + str(self.get_output())
 501            + " ["
 502            + str(str(self.get_output_format()))
 503            + "]"
 504        )
 505        log.info("Config: ")
 506        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 507            "\n"
 508        ):
 509            log.info("\t" + str(d))
 510        log.info("Param: ")
 511        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 512            "\n"
 513        ):
 514            log.info("\t" + str(d))
 515        log.info("Sample list: " + str(self.get_header_sample_list()))
 516        log.info("Dataframe: ")
 517        for d in str(df).split("\n"):
 518            log.info("\t" + str(d))
 519
 520        # garbage collector
 521        del df
 522        gc.collect()
 523
 524        return None
 525
    def get_stats(self) -> dict:
        """
        Compute statistics on the loaded variants and return them as a
        nested dictionary.

        Sections produced:
          - "Infos": input file, total variant count, sample count and
            number of INFO/FORMAT header fields
          - "Variants": counts by chromosome, variant type counts
            (Total/SNV/MNV/InDel) and SNV substitution counts
          - "Samples": per-sample genotype counts (only when the header
            has a GT FORMAT field and a FORMAT column)
          - "Header": description of INFO and FORMAT fields
          - "Quality": QUAL statistics (only when a QUAL column exists)

        :return: dictionary of statistics, keyed by section name
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table name (FROM clause)
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO/FORMAT field definitions from the VCF header object
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not multiplied by 100)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Samples are only checked for genuine VCF-shaped data:
        # a GT FORMAT field and a FORMAT column must both be present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept only when the
                # sample value matches a genotype pattern and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only when at least one genotype row matched
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running index shared across both field groups
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: PyVCF encodes special VCF counts as negative
                # numbers (A=-1, G=-2, R=-3) and unknown as None
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when absent)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when absent)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics (only for VCF-shaped data with a QUAL column)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel WHERE clause below, SQL AND binds
        # tighter than OR, so it reads `len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT))` — confirm this is the intended grouping
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Single-nucleotide substitution counts (e.g. "A>G")
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 747
 748    def stats_to_file(self, file: str = None) -> str:
 749        """
 750        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 751        into a JSON object, and writes the JSON object to the specified file.
 752
 753        :param file: The `file` parameter is a string that represents the file path where the JSON data
 754        will be written
 755        :type file: str
 756        :return: the name of the file that was written to.
 757        """
 758
 759        # Get stats
 760        stats = self.get_stats()
 761
 762        # Serializing json
 763        json_object = json.dumps(stats, indent=4)
 764
 765        # Writing to sample.json
 766        with open(file, "w") as outfile:
 767            outfile.write(json_object)
 768
 769        return file
 770
 771    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
 772        """
 773        The `print_stats` function generates a markdown file and prints the statistics contained in a
 774        JSON file in a formatted manner.
 775
 776        :param output_file: The `output_file` parameter is a string that specifies the path and filename
 777        of the output file where the stats will be printed in Markdown format. If no `output_file` is
 778        provided, a temporary directory will be created and the stats will be saved in a file named
 779        "stats.md" within that
 780        :type output_file: str
 781        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
 782        file where the statistics will be saved. If no value is provided, a temporary directory will be
 783        created and a default file name "stats.json" will be used
 784        :type json_file: str
 785        :return: The function `print_stats` does not return any value. It has a return type annotation
 786        of `None`.
 787        """
 788
 789        # Full path
 790        output_file = full_path(output_file)
 791        json_file = full_path(json_file)
 792
 793        with tempfile.TemporaryDirectory() as tmpdir:
 794
 795            # Files
 796            if not output_file:
 797                output_file = os.path.join(tmpdir, "stats.md")
 798            if not json_file:
 799                json_file = os.path.join(tmpdir, "stats.json")
 800
 801            # Create folders
 802            if not os.path.exists(os.path.dirname(output_file)):
 803                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
 804            if not os.path.exists(os.path.dirname(json_file)):
 805                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
 806
 807            # Create stats JSON file
 808            stats_file = self.stats_to_file(file=json_file)
 809
 810            # Print stats file
 811            with open(stats_file) as f:
 812                stats = yaml.safe_load(f)
 813
 814            # Output
 815            output_title = []
 816            output_index = []
 817            output = []
 818
 819            # Title
 820            output_title.append("# HOWARD Stats")
 821
 822            # Index
 823            output_index.append("## Index")
 824
 825            # Process sections
 826            for section in stats:
 827                infos = stats.get(section)
 828                section_link = "#" + section.lower().replace(" ", "-")
 829                output.append(f"## {section}")
 830                output_index.append(f"- [{section}]({section_link})")
 831
 832                if len(infos):
 833                    for info in infos:
 834                        try:
 835                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
 836                            is_df = True
 837                        except:
 838                            try:
 839                                df = pd.DataFrame.from_dict(
 840                                    json.loads((infos.get(info))), orient="index"
 841                                )
 842                                is_df = True
 843                            except:
 844                                is_df = False
 845                        if is_df:
 846                            output.append(f"### {info}")
 847                            info_link = "#" + info.lower().replace(" ", "-")
 848                            output_index.append(f"   - [{info}]({info_link})")
 849                            output.append(f"{df.to_markdown(index=False)}")
 850                        else:
 851                            output.append(f"- {info}: {infos.get(info)}")
 852                else:
 853                    output.append(f"NA")
 854
 855            # Write stats in markdown file
 856            with open(output_file, "w") as fp:
 857                for item in output_title:
 858                    fp.write("%s\n" % item)
 859                for item in output_index:
 860                    fp.write("%s\n" % item)
 861                for item in output:
 862                    fp.write("%s\n" % item)
 863
 864            # Output stats in markdown
 865            print("")
 866            print("\n\n".join(output_title))
 867            print("")
 868            print("\n\n".join(output))
 869            print("")
 870
 871        return None
 872
 873    def get_input(self) -> str:
 874        """
 875        It returns the value of the input variable.
 876        :return: The input is being returned.
 877        """
 878        return self.input
 879
 880    def get_input_format(self, input_file: str = None) -> str:
 881        """
 882        This function returns the format of the input variable, either from the provided input file or
 883        by prompting for input.
 884
 885        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
 886        represents the file path of the input file. If no `input_file` is provided when calling the
 887        method, it will default to `None`
 888        :type input_file: str
 889        :return: The format of the input variable is being returned.
 890        """
 891
 892        if not input_file:
 893            input_file = self.get_input()
 894        input_format = get_file_format(input_file)
 895        return input_format
 896
 897    def get_input_compressed(self, input_file: str = None) -> str:
 898        """
 899        The function `get_input_compressed` returns the format of the input variable after compressing
 900        it.
 901
 902        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
 903        that represents the file path of the input file. If no `input_file` is provided when calling the
 904        method, it will default to `None` and the method will then call `self.get_input()` to
 905        :type input_file: str
 906        :return: The function `get_input_compressed` returns the compressed format of the input
 907        variable.
 908        """
 909
 910        if not input_file:
 911            input_file = self.get_input()
 912        input_compressed = get_file_compressed(input_file)
 913        return input_compressed
 914
 915    def get_output(self) -> str:
 916        """
 917        It returns the output of the neuron.
 918        :return: The output of the neural network.
 919        """
 920
 921        return self.output
 922
 923    def get_output_format(self, output_file: str = None) -> str:
 924        """
 925        The function `get_output_format` returns the format of the input variable or the output file if
 926        provided.
 927
 928        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
 929        that represents the file path of the output file. If no `output_file` is provided when calling
 930        the method, it will default to the output obtained from the `get_output` method of the class
 931        instance. The
 932        :type output_file: str
 933        :return: The format of the input variable is being returned.
 934        """
 935
 936        if not output_file:
 937            output_file = self.get_output()
 938        output_format = get_file_format(output_file)
 939
 940        return output_format
 941
 942    def get_config(self) -> dict:
 943        """
 944        It returns the config
 945        :return: The config variable is being returned.
 946        """
 947        return self.config
 948
 949    def get_param(self) -> dict:
 950        """
 951        It returns the param
 952        :return: The param variable is being returned.
 953        """
 954        return self.param
 955
 956    def get_connexion_db(self) -> str:
 957        """
 958        It returns the connexion_db attribute of the object
 959        :return: The connexion_db is being returned.
 960        """
 961        return self.connexion_db
 962
 963    def get_prefix(self) -> str:
 964        """
 965        It returns the prefix of the object.
 966        :return: The prefix is being returned.
 967        """
 968        return self.prefix
 969
 970    def get_table_variants(self, clause: str = "select") -> str:
 971        """
 972        This function returns the table_variants attribute of the object
 973
 974        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 975        defaults to select (optional)
 976        :return: The table_variants attribute of the object.
 977        """
 978
 979        # Access
 980        access = self.get_config().get("access", None)
 981
 982        # Clauses "select", "where", "update"
 983        if clause in ["select", "where", "update"]:
 984            table_variants = self.table_variants
 985        # Clause "from"
 986        elif clause in ["from"]:
 987            # For Read Only
 988            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 989                input_file = self.get_input()
 990                table_variants = f"'{input_file}' as variants"
 991            # For Read Write
 992            else:
 993                table_variants = f"{self.table_variants} as variants"
 994        else:
 995            table_variants = self.table_variants
 996        return table_variants
 997
 998    def get_tmp_dir(self) -> str:
 999        """
1000        The function `get_tmp_dir` returns the temporary directory path based on configuration
1001        parameters or a default path.
1002        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1003        configuration, parameters, and a default value of "/tmp".
1004        """
1005
1006        return get_tmp(
1007            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1008        )
1009
1010    def get_connexion_type(self) -> str:
1011        """
1012        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1013
1014        :return: The connexion type is being returned.
1015        """
1016        return self.get_config().get("connexion_type", "memory")
1017
1018    def get_connexion(self):
1019        """
1020        It returns the connection object
1021
1022        :return: The connection object.
1023        """
1024        return self.conn
1025
1026    def close_connexion(self) -> None:
1027        """
1028        This function closes the connection to the database.
1029        :return: The connection is being closed.
1030        """
1031        return self.conn.close()
1032
1033    def get_header(self, type: str = "vcf"):
1034        """
1035        This function returns the header of the VCF file as a list of strings
1036
1037        :param type: the type of header you want to get, defaults to vcf (optional)
1038        :return: The header of the vcf file.
1039        """
1040
1041        if self.header_vcf:
1042            if type == "vcf":
1043                return self.header_vcf
1044            elif type == "list":
1045                return self.header_list
1046        else:
1047            if type == "vcf":
1048                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1049                return header
1050            elif type == "list":
1051                return vcf_required
1052
1053    def get_header_length(self, file: str = None) -> int:
1054        """
1055        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1056        line.
1057
1058        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1059        header file. If this argument is provided, the function will read the header from the specified
1060        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1061        :type file: str
1062        :return: the length of the header list, excluding the #CHROM line.
1063        """
1064
1065        if file:
1066            return len(self.read_vcf_header_file(file=file)) - 1
1067        elif self.get_header(type="list"):
1068            return len(self.get_header(type="list")) - 1
1069        else:
1070            return 0
1071
1072    def get_header_columns(self) -> str:
1073        """
1074        This function returns the header list of a VCF
1075
1076        :return: The length of the header list.
1077        """
1078        if self.get_header():
1079            return self.get_header(type="list")[-1]
1080        else:
1081            return ""
1082
1083    def get_header_columns_as_list(self) -> list:
1084        """
1085        This function returns the header list of a VCF
1086
1087        :return: The length of the header list.
1088        """
1089        if self.get_header():
1090            return self.get_header_columns().strip().split("\t")
1091        else:
1092            return []
1093
1094    def get_header_columns_as_sql(self) -> str:
1095        """
1096        This function retruns header length (without #CHROM line)
1097
1098        :return: The length of the header list.
1099        """
1100        sql_column_list = []
1101        for col in self.get_header_columns_as_list():
1102            sql_column_list.append(f'"{col}"')
1103        return ",".join(sql_column_list)
1104
1105    def get_header_sample_list(self) -> list:
1106        """
1107        This function retruns header length (without #CHROM line)
1108
1109        :return: The length of the header list.
1110        """
1111        return self.header_vcf.samples
1112
1113    def get_verbose(self) -> bool:
1114        """
1115        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1116        exist
1117
1118        :return: The value of the key "verbose" in the config dictionary.
1119        """
1120        return self.get_config().get("verbose", False)
1121
1122    def get_connexion_format(self) -> str:
1123        """
1124        It returns the connexion format of the object.
1125        :return: The connexion_format is being returned.
1126        """
1127        connexion_format = self.connexion_format
1128        if connexion_format not in ["duckdb", "sqlite"]:
1129            log.error(f"Unknown connexion format {connexion_format}")
1130            raise ValueError(f"Unknown connexion format {connexion_format}")
1131        else:
1132            return connexion_format
1133
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the connected database.

        :param file: Path or file-like object of the delimited file to load
        :param columns: Comma-separated (quoted) column names used to build
        the INSERT statement for the duckdb connexion
        :type columns: str
        :param header_len: Number of leading lines to skip before the data
        rows, defaults to 0
        :type header_len: int (optional)
        :param sep: Field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden
        by the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Allow the "load.chunk" configuration entry to override chunksize
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # If chunksize is falsy (0/None), nothing is inserted at all
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" as the local pandas DataFrame
                    # (replacement scan), so the variable name is significant
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas issues the INSERTs itself for sqlite connexions
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1187
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file into the variants table of the connected
        database.

        duckdb connexions load any format supported by the Database helper
        (as a VIEW in read-only mode, a TABLE otherwise); sqlite connexions
        load delimited formats (vcf/tsv/csv/psv) through chunked pandas
        inserts. INFO fields may then be exploded into columns, and indexes
        are created.

        :param input_file: Path of the input file to load; when given, it
        replaces the object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table
        before loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled by the Database helper to
        infer column types; falsy values mean "all rows" (-1), defaults to
        20480
        :type sample_size: int (optional)
        :raises ValueError: If the input format is not compatible with the
        connexion format, or if loading fails
        """

        log.info("Loading...")

        # change input file (and re-read its header)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access mode ("RO" means read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format (only gzip is distinguished here)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite", validated)
        connexion_format = self.get_connexion_format()

        # Sample size (-1 means "all rows")
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists: nothing to load
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the Database helper's FROM clause
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only: expose the source as a VIEW (no copy)
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    # Read-write: materialize a TABLE
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except hides the real cause of the
                # failure; consider `except Exception as e` and logging it
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion (delimited formats only)
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: fixed VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Strcuture with samples
            # NOTE(review): structure_complete aliases structure (same dict),
            # so both names end up with FORMAT and the sample columns
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database table with the full column structure
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter inferred from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): for compressed input a second handle is
                # opened via bgzf and the with-block only closes the first
                # plain-text handle — confirm the bgzf handle is released
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # VCF data rows start after the header lines; other delimited
                # formats are read from the first line
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        # Unsupported connexion/format combination
        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields (if asked in param)
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
1383
1384    def get_explode_infos(self) -> bool:
1385        """
1386        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1387        to False if it is not set.
1388        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1389        value. If the parameter is not present, it will return False.
1390        """
1391
1392        return self.get_param().get("explode", {}).get("explode_infos", False)
1393
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Resolve the list of INFO fields to explode into columns.

        Fields come either from the `explode_infos_fields` argument or from
        the "explode.explode_infos_fields" parameter; each entry is treated
        as a regex pattern matched against the header's INFO fields, and the
        "*" keyword matches all of them.

        :param explode_infos_fields: Comma-separated string (or list) of
        field names/patterns to explode; defaults to the parameter value,
        and to "*" (all header fields) when neither is set
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When True, fields that are not
        present in the header are dropped from the result; when False, an
        unmatched literal field is kept as-is, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The ordered, de-duplicated list of resolved field names.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword (used for set differences below)
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (sorted, unique INFO field names)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (each field is a regex)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search, so explicitly listed
                # fields keep their own position rather than the pattern's
                if fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
1491
1492    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1493        """
1494        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1495        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1496        not provided.
1497
1498        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1499        prefix to be used for exploding or expanding information
1500        :type explode_infos_prefix: str
1501        :return: the value of the variable `explode_infos_prefix`.
1502        """
1503
1504        if not explode_infos_prefix:
1505            explode_infos_prefix = (
1506                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1507            )
1508
1509        return explode_infos_prefix
1510
    def add_column(
        self,
        table_name: str,
        column_name: str,
        column_type: str,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table, optionally dropping a pre-existing column
        of the same name first.

        :param table_name: Name of the table to alter
        :type table_name: str
        :param column_name: Name of the column to add
        :type column_name: str
        :param column_type: SQL type of the new column (e.g. "INTEGER",
        "VARCHAR")
        :type column_type: str
        :param default_value: Optional DEFAULT value for the new column;
        inserted verbatim into the ALTER statement
        :param drop: When True and the column already exists, drop it and
        re-create it; when False (default), an existing column is left
        untouched, defaults to False
        :type drop: bool (optional)
        :return: A dict describing the added column (table_name,
        column_name, column_type, default_value) when a brand-new column was
        added, or None when the column already existed (whether or not it
        was dropped and re-created).
        """

        # added/dropped track which path was taken (added stays False when
        # the column was dropped and re-created)
        added = False
        dropped = False

        # Check if the column already exists in the table (via an empty
        # SELECT to read the column list)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column exists and must not be dropped: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        # NOTE(review): table/column names and default_value are formatted
        # into the SQL directly — callers must pass trusted values
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
1582
1583    def drop_column(
1584        self, column: dict = None, table_name: str = None, column_name: str = None
1585    ) -> bool:
1586        """
1587        The `drop_column` function drops a specified column from a given table in a database and returns
1588        True if the column was successfully dropped, and False if the column does not exist in the
1589        table.
1590
1591        :param column: The `column` parameter is a dictionary that contains information about the column
1592        you want to drop. It has two keys:
1593        :type column: dict
1594        :param table_name: The `table_name` parameter is the name of the table from which you want to
1595        drop a column
1596        :type table_name: str
1597        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1598        from the table
1599        :type column_name: str
1600        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1601        and False if the column does not exist in the table.
1602        """
1603
1604        # Find column infos
1605        if column:
1606            if isinstance(column, dict):
1607                table_name = column.get("table_name", None)
1608                column_name = column.get("column_name", None)
1609            elif isinstance(column, str):
1610                table_name = self.get_table_variants()
1611                column_name = column
1612            else:
1613                table_name = None
1614                column_name = None
1615
1616        if not table_name and not column_name:
1617            return False
1618
1619        # Removed
1620        removed = False
1621
1622        # Check if the column already exists in the table
1623        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1624        columns = self.get_query_to_df(query).columns.tolist()
1625        if column_name in columns:
1626            log.debug(f"The {column_name} column exists in the {table_name} table")
1627        else:
1628            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1629            return False
1630
1631        # Add column in table # ALTER TABLE integers DROP k
1632        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1633        self.execute_query(add_column_query)
1634        removed = True
1635        log.debug(
1636            f"The {column_name} column was successfully dropped to the {table_name} table"
1637        )
1638
1639        return removed
1640
1641    def explode_infos(
1642        self,
1643        prefix: str = None,
1644        create_index: bool = False,
1645        fields: list = None,
1646        force: bool = False,
1647        proccess_all_fields_together: bool = False,
1648    ) -> list:
1649        """
1650        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
1651        columns, returning a list of added columns.
1652
1653        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1654        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1655        `self.get_explode_infos_prefix()` as the prefix
1656        :type prefix: str
1657        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1658        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1659        `False`, indexes will not be created. The default value is `False`, defaults to False
1660        :type create_index: bool (optional)
1661        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
1662        individual columns. If this parameter is not provided, all INFO fields will be exploded
1663        :type fields: list
1664        :param force: The `force` parameter is a boolean flag that determines whether to drop and
1665        recreate the column if it already exists in the table. If `force` is set to `True`, the column
1666        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
1667        defaults to False
1668        :type force: bool (optional)
1669        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1670        flag that determines whether to process all the INFO fields together or individually. If set to
1671        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1672        be processed individually, defaults to False
1673        :type proccess_all_fields_together: bool (optional)
1674        :return: The function `explode_infos` returns a list of added columns.
1675        """
1676
1677        # drop indexes
1678        self.drop_indexes()
1679
1680        # connexion format
1681        connexion_format = self.get_connexion_format()
1682
1683        # Access
1684        access = self.get_config().get("access", None)
1685
1686        # Added columns
1687        added_columns = []
1688
1689        if access not in ["RO"]:
1690
1691            # prefix
1692            if prefix in [None, True] or not isinstance(prefix, str):
1693                if self.get_explode_infos_prefix() not in [None, True]:
1694                    prefix = self.get_explode_infos_prefix()
1695                else:
1696                    prefix = "INFO/"
1697
1698            # table variants
1699            table_variants = self.get_table_variants(clause="select")
1700
1701            # extra infos
1702            try:
1703                extra_infos = self.get_extra_infos()
1704            except:
1705                extra_infos = []
1706
1707            # Header infos
1708            header_infos = self.get_header().infos
1709
1710            log.debug(
1711                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1712            )
1713
1714            sql_info_alter_table_array = []
1715
1716            # Info fields to check
1717            fields_list = list(header_infos)
1718            if fields:
1719                fields_list += fields
1720            fields_list = set(fields_list)
1721
1722            # If no fields
1723            if not fields:
1724                fields = []
1725
1726            # Translate fields if patterns
1727            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1728
1729            for info in fields:
1730
1731                info_id_sql = prefix + info
1732
1733                if (
1734                    info in fields_list
1735                    or prefix + info in fields_list
1736                    or info in extra_infos
1737                ):
1738
1739                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1740
1741                    if info in header_infos:
1742                        info_type = header_infos[info].type
1743                        info_num = header_infos[info].num
1744                    else:
1745                        info_type = "String"
1746                        info_num = 0
1747
1748                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1749                    if info_num != 1:
1750                        type_sql = "VARCHAR"
1751
1752                    # Add field
1753                    added_column = self.add_column(
1754                        table_name=table_variants,
1755                        column_name=info_id_sql,
1756                        column_type=type_sql,
1757                        default_value="null",
1758                        drop=force,
1759                    )
1760
1761                    if added_column:
1762                        added_columns.append(added_column)
1763
1764                    if added_column or force:
1765
1766                        # add field to index
1767                        self.index_additionnal_fields.append(info_id_sql)
1768
1769                        # Update field array
1770                        if connexion_format in ["duckdb"]:
1771                            update_info_field = f"""
1772                            "{info_id_sql}" =
1773                                CASE
1774                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1775                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1776                                END
1777                            """
1778                        elif connexion_format in ["sqlite"]:
1779                            update_info_field = f"""
1780                                "{info_id_sql}" =
1781                                    CASE
1782                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1783                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1784                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1785                                    END
1786                            """
1787
1788                        sql_info_alter_table_array.append(update_info_field)
1789
1790            if sql_info_alter_table_array:
1791
1792                # By chromosomes
1793                try:
1794                    chromosomes_list = list(
1795                        self.get_query_to_df(
1796                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1797                        )["#CHROM"]
1798                    )
1799                except:
1800                    chromosomes_list = [None]
1801
1802                for chrom in chromosomes_list:
1803                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1804
1805                    # Where clause
1806                    where_clause = ""
1807                    if chrom and len(chromosomes_list) > 1:
1808                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1809
1810                    # Update table
1811                    if proccess_all_fields_together:
1812                        sql_info_alter_table_array_join = ", ".join(
1813                            sql_info_alter_table_array
1814                        )
1815                        if sql_info_alter_table_array_join:
1816                            sql_info_alter_table = f"""
1817                                UPDATE {table_variants}
1818                                SET {sql_info_alter_table_array_join}
1819                                {where_clause}
1820                                """
1821                            log.debug(
1822                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1823                            )
1824                            # log.debug(sql_info_alter_table)
1825                            self.conn.execute(sql_info_alter_table)
1826                    else:
1827                        sql_info_alter_num = 0
1828                        for sql_info_alter in sql_info_alter_table_array:
1829                            sql_info_alter_num += 1
1830                            sql_info_alter_table = f"""
1831                                UPDATE {table_variants}
1832                                SET {sql_info_alter}
1833                                {where_clause}
1834                                """
1835                            log.debug(
1836                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1837                            )
1838                            # log.debug(sql_info_alter_table)
1839                            self.conn.execute(sql_info_alter_table)
1840
1841        # create indexes
1842        if create_index:
1843            self.create_indexes()
1844
1845        return added_columns
1846
1847    def create_indexes(self) -> None:
1848        """
1849        Create indexes on the table after insertion
1850        """
1851
1852        # Access
1853        access = self.get_config().get("access", None)
1854
1855        # get table variants
1856        table_variants = self.get_table_variants("FROM")
1857
1858        if self.get_indexing() and access not in ["RO"]:
1859            # Create index
1860            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1861            self.conn.execute(sql_create_table_index)
1862            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1863            self.conn.execute(sql_create_table_index)
1864            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1865            self.conn.execute(sql_create_table_index)
1866            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1867            self.conn.execute(sql_create_table_index)
1868            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1869            self.conn.execute(sql_create_table_index)
1870            for field in self.index_additionnal_fields:
1871                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1872                self.conn.execute(sql_create_table_index)
1873
1874    def drop_indexes(self) -> None:
1875        """
1876        Create indexes on the table after insertion
1877        """
1878
1879        # Access
1880        access = self.get_config().get("access", None)
1881
1882        # get table variants
1883        table_variants = self.get_table_variants("FROM")
1884
1885        # Get database format
1886        connexion_format = self.get_connexion_format()
1887
1888        if access not in ["RO"]:
1889            if connexion_format in ["duckdb"]:
1890                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1891            elif connexion_format in ["sqlite"]:
1892                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1893
1894            list_indexes = self.conn.execute(sql_list_indexes)
1895            index_names = [row[0] for row in list_indexes.fetchall()]
1896            for index in index_names:
1897                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1898                self.conn.execute(sql_drop_table_index)
1899
1900    def read_vcf_header(self, f) -> list:
1901        """
1902        It reads the header of a VCF file and returns a list of the header lines
1903
1904        :param f: the file object
1905        :return: The header lines of the VCF file.
1906        """
1907
1908        header_list = []
1909        for line in f:
1910            header_list.append(line)
1911            if line.startswith("#CHROM"):
1912                break
1913        return header_list
1914
1915    def read_vcf_header_file(self, file: str = None) -> list:
1916        """
1917        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1918        uncompressed files.
1919
1920        :param file: The `file` parameter is a string that represents the path to the VCF header file
1921        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1922        default to `None`
1923        :type file: str
1924        :return: The function `read_vcf_header_file` returns a list.
1925        """
1926
1927        if self.get_input_compressed(input_file=file):
1928            with bgzf.open(file, "rt") as f:
1929                return self.read_vcf_header(f=f)
1930        else:
1931            with open(file, "rt") as f:
1932                return self.read_vcf_header(f=f)
1933
1934    def execute_query(self, query: str):
1935        """
1936        It takes a query as an argument, executes it, and returns the results
1937
1938        :param query: The query to be executed
1939        :return: The result of the query is being returned.
1940        """
1941        if query:
1942            return self.conn.execute(query)  # .fetchall()
1943        else:
1944            return None
1945
1946    def export_output(
1947        self,
1948        output_file: str | None = None,
1949        output_header: str | None = None,
1950        export_header: bool = True,
1951        query: str | None = None,
1952        parquet_partitions: list | None = None,
1953        chunk_size: int | None = None,
1954        threads: int | None = None,
1955        sort: bool = False,
1956        index: bool = False,
1957        order_by: str | None = None,
1958    ) -> bool:
1959        """
1960        The `export_output` function exports data from a VCF file to a specified output file in various
1961        formats, including VCF, CSV, TSV, PSV, and Parquet.
1962
1963        :param output_file: The `output_file` parameter is a string that specifies the name of the
1964        output file to be generated by the function. This is where the exported data will be saved
1965        :type output_file: str
1966        :param output_header: The `output_header` parameter is a string that specifies the name of the
1967        file where the header of the VCF file will be exported. If this parameter is not provided, the
1968        header will be exported to a file with the same name as the `output_file` parameter, but with
1969        the extension "
1970        :type output_header: str
1971        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1972        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1973        True, the header will be exported to a file. If `export_header` is False, the header will not
1974        be, defaults to True, if output format is not VCF
1975        :type export_header: bool (optional)
1976        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1977        select specific data from the VCF file before exporting it. If provided, only the data that
1978        matches the query will be exported
1979        :type query: str
1980        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1981        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1982        organize data in a hierarchical directory structure based on the values of one or more columns.
1983        This can improve query performance when working with large datasets
1984        :type parquet_partitions: list
1985        :param chunk_size: The `chunk_size` parameter specifies the number of
1986        records in batch when exporting data in Parquet format. This parameter is used for
1987        partitioning the Parquet file into multiple files.
1988        :type chunk_size: int
1989        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1990        threads to be used during the export process. It determines the level of parallelism and can
1991        improve the performance of the export operation. If not provided, the function will use the
1992        default number of threads
1993        :type threads: int
1994        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1995        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1996        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1997        False
1998        :type sort: bool (optional)
1999        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2000        created on the output file. If `index` is True, an index will be created. If `index` is False,
2001        no index will be created. The default value is False, defaults to False
2002        :type index: bool (optional)
2003        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2004        sorting the output file. This parameter is only applicable when exporting data in VCF format
2005        :type order_by: str
2006        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2007        None if it doesn't.
2008        """
2009
2010        # Log
2011        log.info("Exporting...")
2012
2013        # Full path
2014        output_file = full_path(output_file)
2015        output_header = full_path(output_header)
2016
2017        # Config
2018        config = self.get_config()
2019
2020        # Param
2021        param = self.get_param()
2022
2023        # Tmp files to remove
2024        tmp_to_remove = []
2025
2026        # If no output, get it
2027        if not output_file:
2028            output_file = self.get_output()
2029
2030        # If not threads
2031        if not threads:
2032            threads = self.get_threads()
2033
2034        # Auto header name with extension
2035        if export_header or output_header:
2036            if not output_header:
2037                output_header = f"{output_file}.hdr"
2038            # Export header
2039            self.export_header(output_file=output_file)
2040
2041        # Switch off export header if VCF output
2042        output_file_type = get_file_format(output_file)
2043        if output_file_type in ["vcf"]:
2044            export_header = False
2045            tmp_to_remove.append(output_header)
2046
2047        # Chunk size
2048        if not chunk_size:
2049            chunk_size = config.get("chunk_size", None)
2050
2051        # Parquet partition
2052        if not parquet_partitions:
2053            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2054        if parquet_partitions and isinstance(parquet_partitions, str):
2055            parquet_partitions = parquet_partitions.split(",")
2056
2057        # Order by
2058        if not order_by:
2059            order_by = param.get("export", {}).get("order_by", "")
2060
2061        # Header in output
2062        header_in_output = param.get("export", {}).get("include_header", False)
2063
2064        # Database
2065        database_source = self.get_connexion()
2066
2067        # Connexion format
2068        connexion_format = self.get_connexion_format()
2069
2070        # Explode infos
2071        if self.get_explode_infos():
2072            self.explode_infos(
2073                prefix=self.get_explode_infos_prefix(),
2074                fields=self.get_explode_infos_fields(),
2075                force=False,
2076            )
2077
2078        # if connexion_format in ["sqlite"] or query:
2079        if connexion_format in ["sqlite"]:
2080
2081            # Export in Parquet
2082            random_tmp = "".join(
2083                random.choice(string.ascii_lowercase) for i in range(10)
2084            )
2085            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2086            tmp_to_remove.append(database_source)
2087
2088            # Table Variants
2089            table_variants = self.get_table_variants()
2090
2091            # Create export query
2092            sql_query_export_subquery = f"""
2093                SELECT * FROM {table_variants}
2094                """
2095
2096            # Write source file
2097            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2098
2099        # Create database
2100        database = Database(
2101            database=database_source,
2102            table="variants",
2103            header_file=output_header,
2104            conn_config=self.get_connexion_config(),
2105        )
2106
2107        # Existing colomns header
2108        # existing_columns_header = database.get_header_file_columns(output_header)
2109        existing_columns_header = database.get_header_columns_from_database()
2110
2111        # Export file
2112        database.export(
2113            output_database=output_file,
2114            output_header=output_header,
2115            existing_columns_header=existing_columns_header,
2116            parquet_partitions=parquet_partitions,
2117            chunk_size=chunk_size,
2118            threads=threads,
2119            sort=sort,
2120            index=index,
2121            header_in_output=header_in_output,
2122            order_by=order_by,
2123            query=query,
2124            export_header=export_header,
2125        )
2126
2127        # Remove
2128        remove_if_exists(tmp_to_remove)
2129
2130        return (os.path.exists(output_file) or None) and (
2131            os.path.exists(output_file) or None
2132        )
2133
2134    def get_extra_infos(self, table: str = None) -> list:
2135        """
2136        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2137        in the header.
2138
2139        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2140        name of the table from which you want to retrieve the extra columns that are not present in the
2141        header. If the `table` parameter is not provided when calling the function, it will default to
2142        using the variants
2143        :type table: str
2144        :return: A list of columns that are in the specified table but not in the header of the table.
2145        """
2146
2147        header_columns = []
2148
2149        if not table:
2150            table = self.get_table_variants(clause="from")
2151            header_columns = self.get_header_columns()
2152
2153        # Check all columns in the database
2154        query = f""" SELECT * FROM {table} LIMIT 1 """
2155        log.debug(f"query {query}")
2156        table_columns = self.get_query_to_df(query).columns.tolist()
2157        extra_columns = []
2158
2159        # Construct extra infos (not in header)
2160        for column in table_columns:
2161            if column not in header_columns:
2162                extra_columns.append(column)
2163
2164        return extra_columns
2165
2166    def get_extra_infos_sql(self, table: str = None) -> str:
2167        """
2168        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2169        by double quotes
2170
2171        :param table: The name of the table to get the extra infos from. If None, the default table is
2172        used
2173        :type table: str
2174        :return: A string of the extra infos
2175        """
2176
2177        return ", ".join(
2178            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2179        )
2180
2181    def export_header(
2182        self,
2183        header_name: str = None,
2184        output_file: str = None,
2185        output_file_ext: str = ".hdr",
2186        clean_header: bool = True,
2187        remove_chrom_line: bool = False,
2188    ) -> str:
2189        """
2190        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2191        specified options, and writes it to a new file.
2192
2193        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2194        this parameter is not specified, the header will be written to the output file
2195        :type header_name: str
2196        :param output_file: The `output_file` parameter in the `export_header` function is used to
2197        specify the name of the output file where the header will be written. If this parameter is not
2198        provided, the header will be written to a temporary file
2199        :type output_file: str
2200        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2201        string that represents the extension of the output header file. By default, it is set to ".hdr"
2202        if not specified by the user. This extension will be appended to the `output_file` name to
2203        create the final, defaults to .hdr
2204        :type output_file_ext: str (optional)
2205        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2206        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2207        `True`, the function will clean the header by modifying certain lines based on a specific
2208        pattern. If `clean_header`, defaults to True
2209        :type clean_header: bool (optional)
2210        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2211        boolean flag that determines whether the #CHROM line should be removed from the header before
2212        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2213        defaults to False
2214        :type remove_chrom_line: bool (optional)
2215        :return: The function `export_header` returns the name of the temporary header file that is
2216        created.
2217        """
2218
2219        if not header_name and not output_file:
2220            output_file = self.get_output()
2221
2222        if self.get_header():
2223
2224            # Get header object
2225            header_obj = self.get_header()
2226
2227            # Create database
2228            db_for_header = Database(database=self.get_input())
2229
2230            # Get real columns in the file
2231            db_header_columns = db_for_header.get_columns()
2232
2233            with tempfile.TemporaryDirectory() as tmpdir:
2234
2235                # Write header file
2236                header_file_tmp = os.path.join(tmpdir, "header")
2237                f = open(header_file_tmp, "w")
2238                vcf.Writer(f, header_obj)
2239                f.close()
2240
2241                # Replace #CHROM line with rel columns
2242                header_list = db_for_header.read_header_file(
2243                    header_file=header_file_tmp
2244                )
2245                header_list[-1] = "\t".join(db_header_columns)
2246
2247                # Remove CHROM line
2248                if remove_chrom_line:
2249                    header_list.pop()
2250
2251                # Clean header
2252                if clean_header:
2253                    header_list_clean = []
2254                    for head in header_list:
2255                        # Clean head for malformed header
2256                        head_clean = head
2257                        head_clean = re.subn(
2258                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2259                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2260                            head_clean,
2261                            2,
2262                        )[0]
2263                        # Write header
2264                        header_list_clean.append(head_clean)
2265                    header_list = header_list_clean
2266
2267            tmp_header_name = output_file + output_file_ext
2268
2269            f = open(tmp_header_name, "w")
2270            for line in header_list:
2271                f.write(line)
2272            f.close()
2273
2274        return tmp_header_name
2275
2276    def export_variant_vcf(
2277        self,
2278        vcf_file,
2279        remove_info: bool = False,
2280        add_samples: bool = True,
2281        list_samples: list = [],
2282        where_clause: str = "",
2283        index: bool = False,
2284        threads: int | None = None,
2285    ) -> bool | None:
2286        """
2287        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2288        remove INFO field, add samples, and control compression and indexing.
2289
2290        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2291        written to. It is the output file that will contain the filtered VCF data based on the specified
2292        parameters
2293        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2294        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2295        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2296        in, defaults to False
2297        :type remove_info: bool (optional)
2298        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2299        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2300        If set to False, the samples will be removed. The default value is True, defaults to True
2301        :type add_samples: bool (optional)
2302        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2303        in the output VCF file. By default, all samples will be included. If you provide a list of
2304        samples, only those samples will be included in the output file
2305        :type list_samples: list
2306        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2307        determines whether or not to create an index for the output VCF file. If `index` is set to
2308        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2309        :type index: bool (optional)
2310        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2311        number of threads to use for exporting the VCF file. It determines how many parallel threads
2312        will be used during the export process. More threads can potentially speed up the export process
2313        by utilizing multiple cores of the processor. If
2314        :type threads: int | None
2315        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2316        method with various parameters including the output file, query, threads, sort flag, and index
2317        flag. The `export_output` method is responsible for exporting the VCF data based on the
2318        specified parameters and configurations provided in the `export_variant_vcf` function.
2319        """
2320
2321        # Config
2322        config = self.get_config()
2323
2324        # Extract VCF
2325        log.debug("Export VCF...")
2326
2327        # Table variants
2328        table_variants = self.get_table_variants()
2329
2330        # Threads
2331        if not threads:
2332            threads = self.get_threads()
2333
2334        # Info fields
2335        if remove_info:
2336            if not isinstance(remove_info, str):
2337                remove_info = "."
2338            info_field = f"""'{remove_info}' as INFO"""
2339        else:
2340            info_field = "INFO"
2341
2342        # Samples fields
2343        if add_samples:
2344            if not list_samples:
2345                list_samples = self.get_header_sample_list()
2346            if list_samples:
2347                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2348            else:
2349                samples_fields = ""
2350            log.debug(f"samples_fields: {samples_fields}")
2351        else:
2352            samples_fields = ""
2353
2354        # Where clause
2355        if where_clause is None:
2356            where_clause = ""
2357
2358        # Variants
2359        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2360        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2361        log.debug(f"sql_query_select={sql_query_select}")
2362
2363        return self.export_output(
2364            output_file=vcf_file,
2365            output_header=None,
2366            export_header=True,
2367            query=sql_query_select,
2368            parquet_partitions=None,
2369            chunk_size=config.get("chunk_size", None),
2370            threads=threads,
2371            sort=True,
2372            index=index,
2373            order_by=None,
2374        )
2375
2376    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2377        """
2378        It takes a list of commands and runs them in parallel using the number of threads specified
2379
2380        :param commands: A list of commands to run
2381        :param threads: The number of threads to use, defaults to 1 (optional)
2382        """
2383
2384        run_parallel_commands(commands, threads)
2385
2386    def get_threads(self, default: int = 1) -> int:
2387        """
2388        This function returns the number of threads to use for a job, with a default value of 1 if not
2389        specified.
2390
2391        :param default: The `default` parameter in the `get_threads` method is used to specify the
2392        default number of threads to use if no specific value is provided. If no value is provided for
2393        the `threads` parameter in the configuration or input parameters, the `default` value will be
2394        used, defaults to 1
2395        :type default: int (optional)
2396        :return: the number of threads to use for the current job.
2397        """
2398
2399        # Config
2400        config = self.get_config()
2401
2402        # Param
2403        param = self.get_param()
2404
2405        # Input threads
2406        input_thread = param.get("threads", config.get("threads", None))
2407
2408        # Check threads
2409        if not input_thread:
2410            threads = default
2411        elif int(input_thread) <= 0:
2412            threads = os.cpu_count()
2413        else:
2414            threads = int(input_thread)
2415        return threads
2416
2417    def get_memory(self, default: str = None) -> str:
2418        """
2419        This function retrieves the memory value from parameters or configuration with a default value
2420        if not found.
2421
2422        :param default: The `get_memory` function takes in a default value as a string parameter. This
2423        default value is used as a fallback in case the `memory` parameter is not provided in the
2424        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2425        the function
2426        :type default: str
2427        :return: The `get_memory` function returns a string value representing the memory parameter. If
2428        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2429        return the default value provided as an argument to the function.
2430        """
2431
2432        # Config
2433        config = self.get_config()
2434
2435        # Param
2436        param = self.get_param()
2437
2438        # Input threads
2439        input_memory = param.get("memory", config.get("memory", None))
2440
2441        # Check threads
2442        if input_memory:
2443            memory = input_memory
2444        else:
2445            memory = default
2446
2447        return memory
2448
2449    def update_from_vcf(self, vcf_file: str) -> None:
2450        """
2451        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2452
2453        :param vcf_file: the path to the VCF file
2454        """
2455
2456        connexion_format = self.get_connexion_format()
2457
2458        if connexion_format in ["duckdb"]:
2459            self.update_from_vcf_duckdb(vcf_file)
2460        elif connexion_format in ["sqlite"]:
2461            self.update_from_vcf_sqlite(vcf_file)
2462
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb backend).

        The VCF body is loaded into a pandas DataFrame; duckdb can then query
        that DataFrame by its local variable name (`vcf_df`) via its
        replacement-scan mechanism. For each variant matching on
        #CHROM/POS/REF/ALT, the incoming INFO value is appended to the
        existing one, with a ';' separator when both sides are non-empty
        (the values '' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file whose INFO values are merged
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines.
        # Assumes get_header_length counts only the '##' meta lines, so the
        # '#CHROM' line becomes the column header row — TODO confirm
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append matching INFO values from the DataFrame; '#CHROM' is cast to
        # VARCHAR on both sides to avoid chromosome-name type mismatches.
        # NOTE: duckdb resolves 'vcf_df' in the FROM clause to the local
        # DataFrame above, so the variable must stay in scope here.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2518
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (sqlite backend).

        Creates a temporary table with the same schema as `variants`, loads
        the VCF body into it, then appends the matching INFO values (joined on
        #CHROM/POS/REF/ALT) to the variants table, with a ';' separator when
        both sides are non-empty (the values '' and '.' are treated as empty).
        The temporary table is dropped afterwards.

        :param vcf_file: the path to the VCF file whose INFO values are merged
        """

        # Create a temporary table cloning the variants schema (WHERE 0 copies
        # the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body (header lines start with '#' and are skipped).
        # The column assignment expects exactly 8 columns, i.e. a VCF without
        # FORMAT/sample columns — TODO confirm callers only pass such files
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (sqlite has no concat() function)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop the temporary table now that the merge is done
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
2576
2577    def drop_variants_table(self) -> None:
2578        """
2579        > This function drops the variants table
2580        """
2581
2582        table_variants = self.get_table_variants()
2583        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2584        self.conn.execute(sql_table_variants)
2585
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` values.

        The INFO/SVTYPE field is temporarily exploded into a column so it can
        participate in the query; any columns added for that purpose are
        dropped again before returning.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if
        it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param overrides config, fallback to the default assembly)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix used for exploded columns
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a column; keep track of what was added
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when an empty one is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create the variant_id column when missing (or when forced).
        # NOTE(review): the existence check uses the literal name "variant_id",
        # not `variant_id_column` — a custom column name is re-created on
        # every call; confirm intent.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Populate the column with a hash of the identifying fields.
            # NOTE(review): the last hash argument is the single-quoted
            # literal '"{prefix}SVTYPE"', i.e. a constant string, not the
            # value of the exploded SVTYPE column — possibly intended to be
            # the column reference "{prefix}SVTYPE"; confirm.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by the SVTYPE explode above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2644
2645    def get_variant_id_column(
2646        self, variant_id_column: str = "variant_id", force: bool = None
2647    ) -> str:
2648        """
2649        This function returns the variant_id column name
2650
2651        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2652        defaults to variant_id
2653        :type variant_id_column: str (optional)
2654        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2655        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2656        if it is not already set, or if it is set
2657        :type force: bool
2658        :return: The variant_id column name.
2659        """
2660
2661        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2662
2663    ###
2664    # Annotation
2665    ###
2666
2667    def scan_databases(
2668        self,
2669        database_formats: list = ["parquet"],
2670        database_releases: list = ["current"],
2671    ) -> dict:
2672        """
2673        The function `scan_databases` scans for available databases based on specified formats and
2674        releases.
2675
2676        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2677        of the databases to be scanned. In this case, the accepted format is "parquet"
2678        :type database_formats: list ["parquet"]
2679        :param database_releases: The `database_releases` parameter is a list that specifies the
2680        releases of the databases to be scanned. In the provided function, the default value for
2681        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2682        databases that are in the "current"
2683        :type database_releases: list
2684        :return: The function `scan_databases` returns a dictionary containing information about
2685        databases that match the specified formats and releases.
2686        """
2687
2688        # Config
2689        config = self.get_config()
2690
2691        # Param
2692        param = self.get_param()
2693
2694        # Param - Assembly
2695        assembly = param.get("assembly", config.get("assembly", None))
2696        if not assembly:
2697            assembly = DEFAULT_ASSEMBLY
2698            log.warning(f"Default assembly '{assembly}'")
2699
2700        # Scan for availabled databases
2701        log.info(
2702            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2703        )
2704        databases_infos_dict = databases_infos(
2705            database_folder_releases=database_releases,
2706            database_formats=database_formats,
2707            assembly=assembly,
2708            config=config,
2709        )
2710        log.info(
2711            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2712        )
2713
2714        return databases_infos_dict
2715
2716    def annotation(self) -> None:
2717        """
2718        It annotates the VCF file with the annotations specified in the config file.
2719        """
2720
2721        # Config
2722        config = self.get_config()
2723
2724        # Param
2725        param = self.get_param()
2726
2727        # Param - Assembly
2728        assembly = param.get("assembly", config.get("assembly", None))
2729        if not assembly:
2730            assembly = DEFAULT_ASSEMBLY
2731            log.warning(f"Default assembly '{assembly}'")
2732
2733        # annotations databases folders
2734        annotations_databases = set(
2735            config.get("folders", {})
2736            .get("databases", {})
2737            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2738            + config.get("folders", {})
2739            .get("databases", {})
2740            .get("parquet", ["~/howard/databases/parquet/current"])
2741            + config.get("folders", {})
2742            .get("databases", {})
2743            .get("bcftools", ["~/howard/databases/bcftools/current"])
2744        )
2745
2746        # Get param annotations
2747        if param.get("annotations", None) and isinstance(
2748            param.get("annotations", None), str
2749        ):
2750            log.debug(param.get("annotations", None))
2751            param_annotation_list = param.get("annotations").split(",")
2752        else:
2753            param_annotation_list = []
2754
2755        # Each tools param
2756        if param.get("annotation_parquet", None) != None:
2757            log.debug(
2758                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2759            )
2760            if isinstance(param.get("annotation_parquet", None), list):
2761                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2762            else:
2763                param_annotation_list.append(param.get("annotation_parquet"))
2764        if param.get("annotation_snpsift", None) != None:
2765            if isinstance(param.get("annotation_snpsift", None), list):
2766                param_annotation_list.append(
2767                    "snpsift:"
2768                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2769                )
2770            else:
2771                param_annotation_list.append(
2772                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2773                )
2774        if param.get("annotation_snpeff", None) != None:
2775            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2776        if param.get("annotation_bcftools", None) != None:
2777            if isinstance(param.get("annotation_bcftools", None), list):
2778                param_annotation_list.append(
2779                    "bcftools:"
2780                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2781                )
2782            else:
2783                param_annotation_list.append(
2784                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2785                )
2786        if param.get("annotation_annovar", None) != None:
2787            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2788        if param.get("annotation_exomiser", None) != None:
2789            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2790        if param.get("annotation_splice", None) != None:
2791            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2792
2793        # Merge param annotations list
2794        param["annotations"] = ",".join(param_annotation_list)
2795
2796        # debug
2797        log.debug(f"param_annotations={param['annotations']}")
2798
2799        if param.get("annotations"):
2800
2801            # Log
2802            # log.info("Annotations - Check annotation parameters")
2803
2804            if not "annotation" in param:
2805                param["annotation"] = {}
2806
2807            # List of annotations parameters
2808            annotations_list_input = {}
2809            if isinstance(param.get("annotations", None), str):
2810                annotation_file_list = [
2811                    value for value in param.get("annotations", "").split(",")
2812                ]
2813                for annotation_file in annotation_file_list:
2814                    annotations_list_input[annotation_file] = {"INFO": None}
2815            else:
2816                annotations_list_input = param.get("annotations", {})
2817
2818            log.info(f"Quick Annotations:")
2819            for annotation_key in list(annotations_list_input.keys()):
2820                log.info(f"   {annotation_key}")
2821
2822            # List of annotations and associated fields
2823            annotations_list = {}
2824
2825            for annotation_file in annotations_list_input:
2826
2827                # Explode annotations if ALL
2828                if (
2829                    annotation_file.upper() == "ALL"
2830                    or annotation_file.upper().startswith("ALL:")
2831                ):
2832
2833                    # check ALL parameters (formats, releases)
2834                    annotation_file_split = annotation_file.split(":")
2835                    database_formats = "parquet"
2836                    database_releases = "current"
2837                    for annotation_file_option in annotation_file_split[1:]:
2838                        database_all_options_split = annotation_file_option.split("=")
2839                        if database_all_options_split[0] == "format":
2840                            database_formats = database_all_options_split[1].split("+")
2841                        if database_all_options_split[0] == "release":
2842                            database_releases = database_all_options_split[1].split("+")
2843
2844                    # Scan for availabled databases
2845                    databases_infos_dict = self.scan_databases(
2846                        database_formats=database_formats,
2847                        database_releases=database_releases,
2848                    )
2849
2850                    # Add found databases in annotation parameters
2851                    for database_infos in databases_infos_dict.keys():
2852                        annotations_list[database_infos] = {"INFO": None}
2853
2854                else:
2855                    annotations_list[annotation_file] = annotations_list_input[
2856                        annotation_file
2857                    ]
2858
2859            # Check each databases
2860            if len(annotations_list):
2861
2862                log.info(
2863                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2864                )
2865
2866                for annotation_file in annotations_list:
2867
2868                    # Init
2869                    annotations = annotations_list.get(annotation_file, None)
2870
2871                    # Annotation snpEff
2872                    if annotation_file.startswith("snpeff"):
2873
2874                        log.debug(f"Quick Annotation snpEff")
2875
2876                        if "snpeff" not in param["annotation"]:
2877                            param["annotation"]["snpeff"] = {}
2878
2879                        if "options" not in param["annotation"]["snpeff"]:
2880                            param["annotation"]["snpeff"]["options"] = ""
2881
2882                        # snpEff options in annotations
2883                        param["annotation"]["snpeff"]["options"] = "".join(
2884                            annotation_file.split(":")[1:]
2885                        )
2886
2887                    # Annotation Annovar
2888                    elif annotation_file.startswith("annovar"):
2889
2890                        log.debug(f"Quick Annotation Annovar")
2891
2892                        if "annovar" not in param["annotation"]:
2893                            param["annotation"]["annovar"] = {}
2894
2895                        if "annotations" not in param["annotation"]["annovar"]:
2896                            param["annotation"]["annovar"]["annotations"] = {}
2897
2898                        # Options
2899                        annotation_file_split = annotation_file.split(":")
2900                        for annotation_file_annotation in annotation_file_split[1:]:
2901                            if annotation_file_annotation:
2902                                param["annotation"]["annovar"]["annotations"][
2903                                    annotation_file_annotation
2904                                ] = annotations
2905
2906                    # Annotation Exomiser
2907                    elif annotation_file.startswith("exomiser"):
2908
2909                        log.debug(f"Quick Annotation Exomiser")
2910
2911                        param["annotation"]["exomiser"] = params_string_to_dict(
2912                            annotation_file
2913                        )
2914
2915                    # Annotation Splice
2916                    elif annotation_file.startswith("splice"):
2917
2918                        log.debug(f"Quick Annotation Splice")
2919
2920                        param["annotation"]["splice"] = params_string_to_dict(
2921                            annotation_file
2922                        )
2923
2924                    # Annotation Parquet or BCFTOOLS
2925                    else:
2926
2927                        # Tools detection
2928                        if annotation_file.startswith("bcftools:"):
2929                            annotation_tool_initial = "bcftools"
2930                            annotation_file = ":".join(annotation_file.split(":")[1:])
2931                        elif annotation_file.startswith("snpsift:"):
2932                            annotation_tool_initial = "snpsift"
2933                            annotation_file = ":".join(annotation_file.split(":")[1:])
2934                        else:
2935                            annotation_tool_initial = None
2936
2937                        # list of files
2938                        annotation_file_list = annotation_file.replace("+", ":").split(
2939                            ":"
2940                        )
2941
2942                        for annotation_file in annotation_file_list:
2943
2944                            if annotation_file:
2945
2946                                # Annotation tool initial
2947                                annotation_tool = annotation_tool_initial
2948
2949                                # Find file
2950                                annotation_file_found = None
2951
2952                                # Expand user
2953                                annotation_file = full_path(annotation_file)
2954
2955                                if os.path.exists(annotation_file):
2956                                    annotation_file_found = annotation_file
2957
2958                                else:
2959                                    # Find within assembly folders
2960                                    for annotations_database in annotations_databases:
2961                                        found_files = find_all(
2962                                            annotation_file,
2963                                            os.path.join(
2964                                                annotations_database, assembly
2965                                            ),
2966                                        )
2967                                        if len(found_files) > 0:
2968                                            annotation_file_found = found_files[0]
2969                                            break
2970                                    if not annotation_file_found and not assembly:
2971                                        # Find within folders
2972                                        for (
2973                                            annotations_database
2974                                        ) in annotations_databases:
2975                                            found_files = find_all(
2976                                                annotation_file, annotations_database
2977                                            )
2978                                            if len(found_files) > 0:
2979                                                annotation_file_found = found_files[0]
2980                                                break
2981                                log.debug(
2982                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2983                                )
2984
2985                                # Full path
2986                                annotation_file_found = full_path(annotation_file_found)
2987
2988                                if annotation_file_found:
2989
2990                                    database = Database(database=annotation_file_found)
2991                                    quick_annotation_format = database.get_format()
2992                                    quick_annotation_is_compressed = (
2993                                        database.is_compressed()
2994                                    )
2995                                    quick_annotation_is_indexed = os.path.exists(
2996                                        f"{annotation_file_found}.tbi"
2997                                    )
2998                                    bcftools_preference = False
2999
3000                                    # Check Annotation Tool
3001                                    if not annotation_tool:
3002                                        if (
3003                                            bcftools_preference
3004                                            and quick_annotation_format
3005                                            in ["vcf", "bed"]
3006                                            and quick_annotation_is_compressed
3007                                            and quick_annotation_is_indexed
3008                                        ):
3009                                            annotation_tool = "bcftools"
3010                                        elif quick_annotation_format in [
3011                                            "vcf",
3012                                            "bed",
3013                                            "tsv",
3014                                            "tsv",
3015                                            "csv",
3016                                            "json",
3017                                            "tbl",
3018                                            "parquet",
3019                                            "duckdb",
3020                                        ]:
3021                                            annotation_tool = "parquet"
3022                                        else:
3023                                            log.error(
3024                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3025                                            )
3026                                            raise ValueError(
3027                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3028                                            )
3029
3030                                    log.debug(
3031                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3032                                    )
3033
3034                                    # Annotation Tool dispatch
3035                                    if annotation_tool:
3036                                        if annotation_tool not in param["annotation"]:
3037                                            param["annotation"][annotation_tool] = {}
3038                                        if (
3039                                            "annotations"
3040                                            not in param["annotation"][annotation_tool]
3041                                        ):
3042                                            param["annotation"][annotation_tool][
3043                                                "annotations"
3044                                            ] = {}
3045                                        param["annotation"][annotation_tool][
3046                                            "annotations"
3047                                        ][annotation_file_found] = annotations
3048
3049                                else:
3050                                    log.error(
3051                                        f"Quick Annotation File {annotation_file} does NOT exist"
3052                                    )
3053
3054                self.set_param(param)
3055
3056        if param.get("annotation", None):
3057            log.info("Annotations")
3058            if param.get("annotation", {}).get("parquet", None):
3059                log.info("Annotations 'parquet'...")
3060                self.annotation_parquet()
3061            if param.get("annotation", {}).get("bcftools", None):
3062                log.info("Annotations 'bcftools'...")
3063                self.annotation_bcftools()
3064            if param.get("annotation", {}).get("snpsift", None):
3065                log.info("Annotations 'snpsift'...")
3066                self.annotation_snpsift()
3067            if param.get("annotation", {}).get("annovar", None):
3068                log.info("Annotations 'annovar'...")
3069                self.annotation_annovar()
3070            if param.get("annotation", {}).get("snpeff", None):
3071                log.info("Annotations 'snpeff'...")
3072                self.annotation_snpeff()
3073            if param.get("annotation", {}).get("exomiser", None) is not None:
3074                log.info("Annotations 'exomiser'...")
3075                self.annotation_exomiser()
3076            if param.get("annotation", {}).get("splice", None) is not None:
3077                log.info("Annotations 'splice' ...")
3078                self.annotation_splice()
3079
3080        # Explode INFOS fields into table fields
3081        if self.get_explode_infos():
3082            self.explode_infos(
3083                prefix=self.get_explode_infos_prefix(),
3084                fields=self.get_explode_infos_fields(),
3085                force=True,
3086            )
3087
3088    def annotation_snpsift(self, threads: int = None) -> None:
3089        """
3090        This function annotate with bcftools
3091
3092        :param threads: Number of threads to use
3093        :return: the value of the variable "return_value".
3094        """
3095
3096        # DEBUG
3097        log.debug("Start annotation with bcftools databases")
3098
3099        # Threads
3100        if not threads:
3101            threads = self.get_threads()
3102        log.debug("Threads: " + str(threads))
3103
3104        # Config
3105        config = self.get_config()
3106        log.debug("Config: " + str(config))
3107
3108        # Config - snpSift
3109        snpsift_bin_command = get_bin_command(
3110            bin="SnpSift.jar",
3111            tool="snpsift",
3112            bin_type="jar",
3113            config=config,
3114            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3115        )
3116        if not snpsift_bin_command:
3117            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3118            log.error(msg_err)
3119            raise ValueError(msg_err)
3120
3121        # Config - bcftools
3122        bcftools_bin_command = get_bin_command(
3123            bin="bcftools",
3124            tool="bcftools",
3125            bin_type="bin",
3126            config=config,
3127            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3128        )
3129        if not bcftools_bin_command:
3130            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3131            log.error(msg_err)
3132            raise ValueError(msg_err)
3133
3134        # Config - BCFTools databases folders
3135        databases_folders = set(
3136            self.get_config()
3137            .get("folders", {})
3138            .get("databases", {})
3139            .get("annotations", ["."])
3140            + self.get_config()
3141            .get("folders", {})
3142            .get("databases", {})
3143            .get("bcftools", ["."])
3144        )
3145        log.debug("Databases annotations: " + str(databases_folders))
3146
3147        # Param
3148        annotations = (
3149            self.get_param()
3150            .get("annotation", {})
3151            .get("snpsift", {})
3152            .get("annotations", None)
3153        )
3154        log.debug("Annotations: " + str(annotations))
3155
3156        # Assembly
3157        assembly = self.get_param().get(
3158            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3159        )
3160
3161        # Data
3162        table_variants = self.get_table_variants()
3163
3164        # Check if not empty
3165        log.debug("Check if not empty")
3166        sql_query_chromosomes = (
3167            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3168        )
3169        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3170        if not sql_query_chromosomes_df["count"][0]:
3171            log.info(f"VCF empty")
3172            return
3173
3174        # VCF header
3175        vcf_reader = self.get_header()
3176        log.debug("Initial header: " + str(vcf_reader.infos))
3177
3178        # Existing annotations
3179        for vcf_annotation in self.get_header().infos:
3180
3181            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3182            log.debug(
3183                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3184            )
3185
3186        if annotations:
3187
3188            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3189
3190                # Export VCF file
3191                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3192
3193                # Init
3194                commands = {}
3195
3196                for annotation in annotations:
3197                    annotation_fields = annotations[annotation]
3198
3199                    # Annotation Name
3200                    annotation_name = os.path.basename(annotation)
3201
3202                    if not annotation_fields:
3203                        annotation_fields = {"INFO": None}
3204
3205                    log.debug(f"Annotation '{annotation_name}'")
3206                    log.debug(
3207                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3208                    )
3209
3210                    # Create Database
3211                    database = Database(
3212                        database=annotation,
3213                        databases_folders=databases_folders,
3214                        assembly=assembly,
3215                    )
3216
3217                    # Find files
3218                    db_file = database.get_database()
3219                    db_file = full_path(db_file)
3220                    db_hdr_file = database.get_header_file()
3221                    db_hdr_file = full_path(db_hdr_file)
3222                    db_file_type = database.get_format()
3223                    db_tbi_file = f"{db_file}.tbi"
3224                    db_file_compressed = database.is_compressed()
3225
3226                    # Check if compressed
3227                    if not db_file_compressed:
3228                        log.error(
3229                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3230                        )
3231                        raise ValueError(
3232                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3233                        )
3234
3235                    # Check if indexed
3236                    if not os.path.exists(db_tbi_file):
3237                        log.error(
3238                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3239                        )
3240                        raise ValueError(
3241                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3242                        )
3243
3244                    # Check index - try to create if not exists
3245                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3246                        log.error("Annotation failed: database not valid")
3247                        log.error(f"Annotation annotation file: {db_file}")
3248                        log.error(f"Annotation annotation header: {db_hdr_file}")
3249                        log.error(f"Annotation annotation index: {db_tbi_file}")
3250                        raise ValueError(
3251                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3252                        )
3253                    else:
3254
3255                        log.debug(
3256                            f"Annotation '{annotation}' - file: "
3257                            + str(db_file)
3258                            + " and "
3259                            + str(db_hdr_file)
3260                        )
3261
3262                        # Load header as VCF object
3263                        db_hdr_vcf = Variants(input=db_hdr_file)
3264                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3265                        log.debug(
3266                            "Annotation database header: "
3267                            + str(db_hdr_vcf_header_infos)
3268                        )
3269
3270                        # For all fields in database
3271                        annotation_fields_full = False
3272                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3273                            annotation_fields = {
3274                                key: key for key in db_hdr_vcf_header_infos
3275                            }
3276                            log.debug(
3277                                "Annotation database header - All annotations added: "
3278                                + str(annotation_fields)
3279                            )
3280                            annotation_fields_full = True
3281
3282                        # # Create file for field rename
3283                        # log.debug("Create file for field rename")
3284                        # tmp_rename = NamedTemporaryFile(
3285                        #     prefix=self.get_prefix(),
3286                        #     dir=self.get_tmp_dir(),
3287                        #     suffix=".rename",
3288                        #     delete=False,
3289                        # )
3290                        # tmp_rename_name = tmp_rename.name
3291                        # tmp_files.append(tmp_rename_name)
3292
3293                        # Number of fields
3294                        nb_annotation_field = 0
3295                        annotation_list = []
3296                        annotation_infos_rename_list = []
3297
3298                        for annotation_field in annotation_fields:
3299
3300                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3301                            annotation_fields_new_name = annotation_fields.get(
3302                                annotation_field, annotation_field
3303                            )
3304                            if not annotation_fields_new_name:
3305                                annotation_fields_new_name = annotation_field
3306
3307                            # Check if field is in DB and if field is not elready in input data
3308                            if (
3309                                annotation_field in db_hdr_vcf.get_header().infos
3310                                and annotation_fields_new_name
3311                                not in self.get_header().infos
3312                            ):
3313
3314                                log.info(
3315                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3316                                )
3317
3318                                # BCFTools annotate param to rename fields
3319                                if annotation_field != annotation_fields_new_name:
3320                                    annotation_infos_rename_list.append(
3321                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3322                                    )
3323
3324                                # Add INFO field to header
3325                                db_hdr_vcf_header_infos_number = (
3326                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3327                                )
3328                                db_hdr_vcf_header_infos_type = (
3329                                    db_hdr_vcf_header_infos[annotation_field].type
3330                                    or "String"
3331                                )
3332                                db_hdr_vcf_header_infos_description = (
3333                                    db_hdr_vcf_header_infos[annotation_field].desc
3334                                    or f"{annotation_field} description"
3335                                )
3336                                db_hdr_vcf_header_infos_source = (
3337                                    db_hdr_vcf_header_infos[annotation_field].source
3338                                    or "unknown"
3339                                )
3340                                db_hdr_vcf_header_infos_version = (
3341                                    db_hdr_vcf_header_infos[annotation_field].version
3342                                    or "unknown"
3343                                )
3344
3345                                vcf_reader.infos[annotation_fields_new_name] = (
3346                                    vcf.parser._Info(
3347                                        annotation_fields_new_name,
3348                                        db_hdr_vcf_header_infos_number,
3349                                        db_hdr_vcf_header_infos_type,
3350                                        db_hdr_vcf_header_infos_description,
3351                                        db_hdr_vcf_header_infos_source,
3352                                        db_hdr_vcf_header_infos_version,
3353                                        self.code_type_map[
3354                                            db_hdr_vcf_header_infos_type
3355                                        ],
3356                                    )
3357                                )
3358
3359                                annotation_list.append(annotation_field)
3360
3361                                nb_annotation_field += 1
3362
3363                            else:
3364
3365                                if (
3366                                    annotation_field
3367                                    not in db_hdr_vcf.get_header().infos
3368                                ):
3369                                    log.warning(
3370                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3371                                    )
3372                                if (
3373                                    annotation_fields_new_name
3374                                    in self.get_header().infos
3375                                ):
3376                                    log.warning(
3377                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3378                                    )
3379
3380                        log.info(
3381                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3382                        )
3383
3384                        annotation_infos = ",".join(annotation_list)
3385
3386                        if annotation_infos != "":
3387
3388                            # Annotated VCF (and error file)
3389                            tmp_annotation_vcf_name = os.path.join(
3390                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3391                            )
3392                            tmp_annotation_vcf_name_err = (
3393                                tmp_annotation_vcf_name + ".err"
3394                            )
3395
3396                            # Add fields to annotate
3397                            if not annotation_fields_full:
3398                                annotation_infos_option = f"-info {annotation_infos}"
3399                            else:
3400                                annotation_infos_option = ""
3401
3402                            # Info fields rename
3403                            if annotation_infos_rename_list:
3404                                annotation_infos_rename = " -c " + ",".join(
3405                                    annotation_infos_rename_list
3406                                )
3407                            else:
3408                                annotation_infos_rename = ""
3409
3410                            # Annotate command
3411                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3412
3413                            # Add command
3414                            commands[command_annotate] = tmp_annotation_vcf_name
3415
3416                if commands:
3417
3418                    # Export VCF file
3419                    self.export_variant_vcf(
3420                        vcf_file=tmp_vcf_name,
3421                        remove_info=True,
3422                        add_samples=False,
3423                        index=True,
3424                    )
3425                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3426
3427                    # Num command
3428                    nb_command = 0
3429
3430                    # Annotate
3431                    for command_annotate in commands:
3432                        nb_command += 1
3433                        log.info(
3434                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3435                        )
3436                        log.debug(f"command_annotate={command_annotate}")
3437                        run_parallel_commands([command_annotate], threads)
3438
3439                        # Debug
3440                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3441
3442                        # Update variants
3443                        log.info(
3444                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3445                        )
3446                        self.update_from_vcf(commands[command_annotate])
3447
3448    def annotation_bcftools(self, threads: int = None) -> None:
3449        """
3450        This function annotate with bcftools
3451
3452        :param threads: Number of threads to use
3453        :return: the value of the variable "return_value".
3454        """
3455
3456        # DEBUG
3457        log.debug("Start annotation with bcftools databases")
3458
3459        # Threads
3460        if not threads:
3461            threads = self.get_threads()
3462        log.debug("Threads: " + str(threads))
3463
3464        # Config
3465        config = self.get_config()
3466        log.debug("Config: " + str(config))
3467
3468        # DEBUG
3469        delete_tmp = True
3470        if self.get_config().get("verbosity", "warning") in ["debug"]:
3471            delete_tmp = False
3472            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3473
3474        # Config - BCFTools bin command
3475        bcftools_bin_command = get_bin_command(
3476            bin="bcftools",
3477            tool="bcftools",
3478            bin_type="bin",
3479            config=config,
3480            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3481        )
3482        if not bcftools_bin_command:
3483            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3484            log.error(msg_err)
3485            raise ValueError(msg_err)
3486
3487        # Config - BCFTools databases folders
3488        databases_folders = set(
3489            self.get_config()
3490            .get("folders", {})
3491            .get("databases", {})
3492            .get("annotations", ["."])
3493            + self.get_config()
3494            .get("folders", {})
3495            .get("databases", {})
3496            .get("bcftools", ["."])
3497        )
3498        log.debug("Databases annotations: " + str(databases_folders))
3499
3500        # Param
3501        annotations = (
3502            self.get_param()
3503            .get("annotation", {})
3504            .get("bcftools", {})
3505            .get("annotations", None)
3506        )
3507        log.debug("Annotations: " + str(annotations))
3508
3509        # Assembly
3510        assembly = self.get_param().get(
3511            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3512        )
3513
3514        # Data
3515        table_variants = self.get_table_variants()
3516
3517        # Check if not empty
3518        log.debug("Check if not empty")
3519        sql_query_chromosomes = (
3520            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3521        )
3522        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3523        if not sql_query_chromosomes_df["count"][0]:
3524            log.info(f"VCF empty")
3525            return
3526
3527        # Export in VCF
3528        log.debug("Create initial file to annotate")
3529        tmp_vcf = NamedTemporaryFile(
3530            prefix=self.get_prefix(),
3531            dir=self.get_tmp_dir(),
3532            suffix=".vcf.gz",
3533            delete=False,
3534        )
3535        tmp_vcf_name = tmp_vcf.name
3536
3537        # VCF header
3538        vcf_reader = self.get_header()
3539        log.debug("Initial header: " + str(vcf_reader.infos))
3540
3541        # Existing annotations
3542        for vcf_annotation in self.get_header().infos:
3543
3544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3545            log.debug(
3546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3547            )
3548
3549        if annotations:
3550
3551            tmp_ann_vcf_list = []
3552            commands = []
3553            tmp_files = []
3554            err_files = []
3555
3556            for annotation in annotations:
3557                annotation_fields = annotations[annotation]
3558
3559                # Annotation Name
3560                annotation_name = os.path.basename(annotation)
3561
3562                if not annotation_fields:
3563                    annotation_fields = {"INFO": None}
3564
3565                log.debug(f"Annotation '{annotation_name}'")
3566                log.debug(
3567                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3568                )
3569
3570                # Create Database
3571                database = Database(
3572                    database=annotation,
3573                    databases_folders=databases_folders,
3574                    assembly=assembly,
3575                )
3576
3577                # Find files
3578                db_file = database.get_database()
3579                db_file = full_path(db_file)
3580                db_hdr_file = database.get_header_file()
3581                db_hdr_file = full_path(db_hdr_file)
3582                db_file_type = database.get_format()
3583                db_tbi_file = f"{db_file}.tbi"
3584                db_file_compressed = database.is_compressed()
3585
3586                # Check if compressed
3587                if not db_file_compressed:
3588                    log.error(
3589                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3590                    )
3591                    raise ValueError(
3592                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3593                    )
3594
3595                # Check if indexed
3596                if not os.path.exists(db_tbi_file):
3597                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3598                    raise ValueError(
3599                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3600                    )
3601
3602                # Check index - try to create if not exists
3603                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3604                    log.error("Annotation failed: database not valid")
3605                    log.error(f"Annotation annotation file: {db_file}")
3606                    log.error(f"Annotation annotation header: {db_hdr_file}")
3607                    log.error(f"Annotation annotation index: {db_tbi_file}")
3608                    raise ValueError(
3609                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3610                    )
3611                else:
3612
3613                    log.debug(
3614                        f"Annotation '{annotation}' - file: "
3615                        + str(db_file)
3616                        + " and "
3617                        + str(db_hdr_file)
3618                    )
3619
3620                    # Load header as VCF object
3621                    db_hdr_vcf = Variants(input=db_hdr_file)
3622                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3623                    log.debug(
3624                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3625                    )
3626
3627                    # For all fields in database
3628                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3629                        annotation_fields = {
3630                            key: key for key in db_hdr_vcf_header_infos
3631                        }
3632                        log.debug(
3633                            "Annotation database header - All annotations added: "
3634                            + str(annotation_fields)
3635                        )
3636
3637                    # Number of fields
3638                    nb_annotation_field = 0
3639                    annotation_list = []
3640
3641                    for annotation_field in annotation_fields:
3642
3643                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3644                        annotation_fields_new_name = annotation_fields.get(
3645                            annotation_field, annotation_field
3646                        )
3647                        if not annotation_fields_new_name:
3648                            annotation_fields_new_name = annotation_field
3649
3650                        # Check if field is in DB and if field is not elready in input data
3651                        if (
3652                            annotation_field in db_hdr_vcf.get_header().infos
3653                            and annotation_fields_new_name
3654                            not in self.get_header().infos
3655                        ):
3656
3657                            log.info(
3658                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3659                            )
3660
3661                            # Add INFO field to header
3662                            db_hdr_vcf_header_infos_number = (
3663                                db_hdr_vcf_header_infos[annotation_field].num or "."
3664                            )
3665                            db_hdr_vcf_header_infos_type = (
3666                                db_hdr_vcf_header_infos[annotation_field].type
3667                                or "String"
3668                            )
3669                            db_hdr_vcf_header_infos_description = (
3670                                db_hdr_vcf_header_infos[annotation_field].desc
3671                                or f"{annotation_field} description"
3672                            )
3673                            db_hdr_vcf_header_infos_source = (
3674                                db_hdr_vcf_header_infos[annotation_field].source
3675                                or "unknown"
3676                            )
3677                            db_hdr_vcf_header_infos_version = (
3678                                db_hdr_vcf_header_infos[annotation_field].version
3679                                or "unknown"
3680                            )
3681
3682                            vcf_reader.infos[annotation_fields_new_name] = (
3683                                vcf.parser._Info(
3684                                    annotation_fields_new_name,
3685                                    db_hdr_vcf_header_infos_number,
3686                                    db_hdr_vcf_header_infos_type,
3687                                    db_hdr_vcf_header_infos_description,
3688                                    db_hdr_vcf_header_infos_source,
3689                                    db_hdr_vcf_header_infos_version,
3690                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3691                                )
3692                            )
3693
3694                            # annotation_list.append(annotation_field)
3695                            if annotation_field != annotation_fields_new_name:
3696                                annotation_list.append(
3697                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3698                                )
3699                            else:
3700                                annotation_list.append(annotation_field)
3701
3702                            nb_annotation_field += 1
3703
3704                        else:
3705
3706                            if annotation_field not in db_hdr_vcf.get_header().infos:
3707                                log.warning(
3708                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3709                                )
3710                            if annotation_fields_new_name in self.get_header().infos:
3711                                log.warning(
3712                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3713                                )
3714
3715                    log.info(
3716                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3717                    )
3718
3719                    annotation_infos = ",".join(annotation_list)
3720
3721                    if annotation_infos != "":
3722
3723                        # Protect header for bcftools (remove "#CHROM" and variants line)
3724                        log.debug("Protect Header file - remove #CHROM line if exists")
3725                        tmp_header_vcf = NamedTemporaryFile(
3726                            prefix=self.get_prefix(),
3727                            dir=self.get_tmp_dir(),
3728                            suffix=".hdr",
3729                            delete=False,
3730                        )
3731                        tmp_header_vcf_name = tmp_header_vcf.name
3732                        tmp_files.append(tmp_header_vcf_name)
3733                        # Command
3734                        if db_hdr_file.endswith(".gz"):
3735                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3736                        else:
3737                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3738                        # Run
3739                        run_parallel_commands([command_extract_header], 1)
3740
3741                        # Find chomosomes
3742                        log.debug("Find chromosomes ")
3743                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3744                        sql_query_chromosomes_df = self.get_query_to_df(
3745                            sql_query_chromosomes
3746                        )
3747                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3748
3749                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3750
3751                        # BED columns in the annotation file
3752                        if db_file_type in ["bed"]:
3753                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3754
3755                        for chrom in chomosomes_list:
3756
3757                            # Create BED on initial VCF
3758                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3759                            tmp_bed = NamedTemporaryFile(
3760                                prefix=self.get_prefix(),
3761                                dir=self.get_tmp_dir(),
3762                                suffix=".bed",
3763                                delete=False,
3764                            )
3765                            tmp_bed_name = tmp_bed.name
3766                            tmp_files.append(tmp_bed_name)
3767
3768                            # Detecte regions
3769                            log.debug(
3770                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3771                            )
3772                            window = 1000000
3773                            sql_query_intervals_for_bed = f"""
3774                                SELECT  \"#CHROM\",
3775                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3776                                        \"POS\"+{window}
3777                                FROM {table_variants} as table_variants
3778                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3779                            """
3780                            regions = self.conn.execute(
3781                                sql_query_intervals_for_bed
3782                            ).fetchall()
3783                            merged_regions = merge_regions(regions)
3784                            log.debug(
3785                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3786                            )
3787
3788                            header = ["#CHROM", "START", "END"]
3789                            with open(tmp_bed_name, "w") as f:
3790                                # Write the header with tab delimiter
3791                                f.write("\t".join(header) + "\n")
3792                                for d in merged_regions:
3793                                    # Write each data row with tab delimiter
3794                                    f.write("\t".join(map(str, d)) + "\n")
3795
3796                            # Tmp files
3797                            tmp_annotation_vcf = NamedTemporaryFile(
3798                                prefix=self.get_prefix(),
3799                                dir=self.get_tmp_dir(),
3800                                suffix=".vcf.gz",
3801                                delete=False,
3802                            )
3803                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3804                            tmp_files.append(tmp_annotation_vcf_name)
3805                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3806                            tmp_annotation_vcf_name_err = (
3807                                tmp_annotation_vcf_name + ".err"
3808                            )
3809                            err_files.append(tmp_annotation_vcf_name_err)
3810
3811                            # Annotate Command
3812                            log.debug(
3813                                f"Annotation '{annotation}' - add bcftools command"
3814                            )
3815
3816                            # Command
3817                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3818
3819                            # Add command
3820                            commands.append(command_annotate)
3821
3822            # if some commands
3823            if commands:
3824
3825                # Export VCF file
3826                self.export_variant_vcf(
3827                    vcf_file=tmp_vcf_name,
3828                    remove_info=True,
3829                    add_samples=False,
3830                    index=True,
3831                )
3832
3833                # Threads
3834                # calculate threads for annotated commands
3835                if commands:
3836                    threads_bcftools_annotate = round(threads / len(commands))
3837                else:
3838                    threads_bcftools_annotate = 1
3839
3840                if not threads_bcftools_annotate:
3841                    threads_bcftools_annotate = 1
3842
3843                # Add threads option to bcftools commands
3844                if threads_bcftools_annotate > 1:
3845                    commands_threaded = []
3846                    for command in commands:
3847                        commands_threaded.append(
3848                            command.replace(
3849                                f"{bcftools_bin_command} annotate ",
3850                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3851                            )
3852                        )
3853                    commands = commands_threaded
3854
3855                # Command annotation multithreading
3856                log.debug(f"Annotation - Annotation commands: " + str(commands))
3857                log.info(
3858                    f"Annotation - Annotation multithreaded in "
3859                    + str(len(commands))
3860                    + " commands"
3861                )
3862
3863                run_parallel_commands(commands, threads)
3864
3865                # Merge
3866                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3867
3868                if tmp_ann_vcf_list_cmd:
3869
3870                    # Tmp file
3871                    tmp_annotate_vcf = NamedTemporaryFile(
3872                        prefix=self.get_prefix(),
3873                        dir=self.get_tmp_dir(),
3874                        suffix=".vcf.gz",
3875                        delete=True,
3876                    )
3877                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3878                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3879                    err_files.append(tmp_annotate_vcf_name_err)
3880
3881                    # Tmp file remove command
3882                    tmp_files_remove_command = ""
3883                    if tmp_files:
3884                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3885
3886                    # Command merge
3887                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3888                    log.info(
3889                        f"Annotation - Annotation merging "
3890                        + str(len(commands))
3891                        + " annotated files"
3892                    )
3893                    log.debug(f"Annotation - merge command: {merge_command}")
3894                    run_parallel_commands([merge_command], 1)
3895
3896                    # Error messages
3897                    log.info(f"Error/Warning messages:")
3898                    error_message_command_all = []
3899                    error_message_command_warning = []
3900                    error_message_command_err = []
3901                    for err_file in err_files:
3902                        with open(err_file, "r") as f:
3903                            for line in f:
3904                                message = line.strip()
3905                                error_message_command_all.append(message)
3906                                if line.startswith("[W::"):
3907                                    error_message_command_warning.append(message)
3908                                if line.startswith("[E::"):
3909                                    error_message_command_err.append(
3910                                        f"{err_file}: " + message
3911                                    )
3912                    # log info
3913                    for message in list(
3914                        set(error_message_command_err + error_message_command_warning)
3915                    ):
3916                        log.info(f"   {message}")
3917                    # debug info
3918                    for message in list(set(error_message_command_all)):
3919                        log.debug(f"   {message}")
3920                    # failed
3921                    if len(error_message_command_err):
3922                        log.error("Annotation failed: Error in commands")
3923                        raise ValueError("Annotation failed: Error in commands")
3924
3925                    # Update variants
3926                    log.info(f"Annotation - Updating...")
3927                    self.update_from_vcf(tmp_annotate_vcf_name)
3928
3929    def annotation_exomiser(self, threads: int = None) -> None:
3930        """
3931        This function annotate with Exomiser
3932
3933        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3934        - "analysis" (dict/file):
3935            Full analysis dictionnary parameters (see Exomiser docs).
3936            Either a dict, or a file in JSON or YAML format.
3937            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3938            Default : None
3939        - "preset" (string):
3940            Analysis preset (available in config folder).
3941            Used if no full "analysis" is provided.
3942            Default: "exome"
3943        - "phenopacket" (dict/file):
3944            Samples and phenotipic features parameters (see Exomiser docs).
3945            Either a dict, or a file in JSON or YAML format.
3946            Default: None
3947        - "subject" (dict):
3948            Sample parameters (see Exomiser docs).
3949            Example:
3950                "subject":
3951                    {
3952                        "id": "ISDBM322017",
3953                        "sex": "FEMALE"
3954                    }
3955            Default: None
3956        - "sample" (string):
3957            Sample name to construct "subject" section:
3958                "subject":
3959                    {
3960                        "id": "<sample>",
3961                        "sex": "UNKNOWN_SEX"
3962                    }
3963            Default: None
3964        - "phenotypicFeatures" (dict)
3965            Phenotypic features to construct "subject" section.
3966            Example:
3967                "phenotypicFeatures":
3968                    [
3969                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3970                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3971                    ]
3972        - "hpo" (list)
3973            List of HPO ids as phenotypic features.
3974            Example:
3975                "hpo": ['0001156', '0001363', '0011304', '0010055']
3976            Default: []
3977        - "outputOptions" (dict):
3978            Output options (see Exomiser docs).
3979            Default:
3980                "output_options" =
3981                    {
3982                        "outputContributingVariantsOnly": False,
3983                        "numGenes": 0,
3984                        "outputFormats": ["TSV_VARIANT", "VCF"]
3985                    }
3986        - "transcript_source" (string):
3987            Transcript source (either "refseq", "ucsc", "ensembl")
3988            Default: "refseq"
3989        - "exomiser_to_info" (boolean):
3990            Add exomiser TSV file columns as INFO fields in VCF.
3991            Default: False
3992        - "release" (string):
3993            Exomise database release.
3994            If not exists, database release will be downloaded (take a while).
3995            Default: None (provided by application.properties configuration file)
3996        - "exomiser_application_properties" (file):
3997            Exomiser configuration file (see Exomiser docs).
3998            Useful to automatically download databases (especially for specific genome databases).
3999
4000        Notes:
4001        - If no sample in parameters, first sample in VCF will be chosen
4002        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4003
4004        :param threads: The number of threads to use
4005        :return: None.
4006        """
4007
4008        # DEBUG
4009        log.debug("Start annotation with Exomiser databases")
4010
4011        # Threads
4012        if not threads:
4013            threads = self.get_threads()
4014        log.debug("Threads: " + str(threads))
4015
4016        # Config
4017        config = self.get_config()
4018        log.debug("Config: " + str(config))
4019
4020        # Config - Folders - Databases
4021        databases_folders = (
4022            config.get("folders", {})
4023            .get("databases", {})
4024            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4025        )
4026        databases_folders = full_path(databases_folders)
4027        if not os.path.exists(databases_folders):
4028            log.error(f"Databases annotations: {databases_folders} NOT found")
4029        log.debug("Databases annotations: " + str(databases_folders))
4030
4031        # Config - Exomiser
4032        exomiser_bin_command = get_bin_command(
4033            bin="exomiser-cli*.jar",
4034            tool="exomiser",
4035            bin_type="jar",
4036            config=config,
4037            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4038        )
4039        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4040        if not exomiser_bin_command:
4041            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4042            log.error(msg_err)
4043            raise ValueError(msg_err)
4044
4045        # Param
4046        param = self.get_param()
4047        log.debug("Param: " + str(param))
4048
4049        # Param - Exomiser
4050        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4051        log.debug(f"Param Exomiser: {param_exomiser}")
4052
4053        # Param - Assembly
4054        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4055        log.debug("Assembly: " + str(assembly))
4056
4057        # Data
4058        table_variants = self.get_table_variants()
4059
4060        # Check if not empty
4061        log.debug("Check if not empty")
4062        sql_query_chromosomes = (
4063            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4064        )
4065        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4066            log.info(f"VCF empty")
4067            return False
4068
4069        # VCF header
4070        vcf_reader = self.get_header()
4071        log.debug("Initial header: " + str(vcf_reader.infos))
4072
4073        # Samples
4074        samples = self.get_header_sample_list()
4075        if not samples:
4076            log.error("No Samples in VCF")
4077            return False
4078        log.debug(f"Samples: {samples}")
4079
4080        # Memory limit
4081        memory_limit = self.get_memory("8G")
4082        log.debug(f"memory_limit: {memory_limit}")
4083
4084        # Exomiser java options
4085        exomiser_java_options = (
4086            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4087        )
4088        log.debug(f"Exomiser java options: {exomiser_java_options}")
4089
4090        # Download Exomiser (if not exists)
4091        exomiser_release = param_exomiser.get("release", None)
4092        exomiser_application_properties = param_exomiser.get(
4093            "exomiser_application_properties", None
4094        )
4095        databases_download_exomiser(
4096            assemblies=[assembly],
4097            exomiser_folder=databases_folders,
4098            exomiser_release=exomiser_release,
4099            exomiser_phenotype_release=exomiser_release,
4100            exomiser_application_properties=exomiser_application_properties,
4101        )
4102
4103        # Force annotation
4104        force_update_annotation = True
4105
4106        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4107            log.debug("Start annotation Exomiser")
4108
4109            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4110
4111                # tmp_dir = "/tmp/exomiser"
4112
4113                ### ANALYSIS ###
4114                ################
4115
4116                # Create analysis.json through analysis dict
4117                # either analysis in param or by default
4118                # depending on preset exome/genome)
4119
4120                # Init analysis dict
4121                param_exomiser_analysis_dict = {}
4122
4123                # analysis from param
4124                param_exomiser_analysis = param_exomiser.get("analysis", {})
4125                param_exomiser_analysis = full_path(param_exomiser_analysis)
4126
4127                # If analysis in param -> load anlaysis json
4128                if param_exomiser_analysis:
4129
4130                    # If param analysis is a file and exists
4131                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4132                        param_exomiser_analysis
4133                    ):
4134                        # Load analysis file into analysis dict (either yaml or json)
4135                        with open(param_exomiser_analysis) as json_file:
4136                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4137
4138                    # If param analysis is a dict
4139                    elif isinstance(param_exomiser_analysis, dict):
4140                        # Load analysis dict into analysis dict (either yaml or json)
4141                        param_exomiser_analysis_dict = param_exomiser_analysis
4142
4143                    # Error analysis type
4144                    else:
4145                        log.error(f"Analysis type unknown. Check param file.")
4146                        raise ValueError(f"Analysis type unknown. Check param file.")
4147
4148                # Case no input analysis config file/dict
4149                # Use preset (exome/genome) to open default config file
4150                if not param_exomiser_analysis_dict:
4151
4152                    # default preset
4153                    default_preset = "exome"
4154
4155                    # Get param preset or default preset
4156                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4157
4158                    # Try to find if preset is a file
4159                    if os.path.exists(param_exomiser_preset):
4160                        # Preset file is provided in full path
4161                        param_exomiser_analysis_default_config_file = (
4162                            param_exomiser_preset
4163                        )
4164                    # elif os.path.exists(full_path(param_exomiser_preset)):
4165                    #     # Preset file is provided in full path
4166                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4167                    elif os.path.exists(
4168                        os.path.join(folder_config, param_exomiser_preset)
4169                    ):
4170                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4171                        param_exomiser_analysis_default_config_file = os.path.join(
4172                            folder_config, param_exomiser_preset
4173                        )
4174                    else:
4175                        # Construct preset file
4176                        param_exomiser_analysis_default_config_file = os.path.join(
4177                            folder_config,
4178                            f"preset-{param_exomiser_preset}-analysis.json",
4179                        )
4180
4181                    # If preset file exists
4182                    param_exomiser_analysis_default_config_file = full_path(
4183                        param_exomiser_analysis_default_config_file
4184                    )
4185                    if os.path.exists(param_exomiser_analysis_default_config_file):
4186                        # Load prest file into analysis dict (either yaml or json)
4187                        with open(
4188                            param_exomiser_analysis_default_config_file
4189                        ) as json_file:
4190                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4191                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4192                                json_file
4193                            )
4194
4195                    # Error preset file
4196                    else:
4197                        log.error(
4198                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4199                        )
4200                        raise ValueError(
4201                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4202                        )
4203
4204                # If no analysis dict created
4205                if not param_exomiser_analysis_dict:
4206                    log.error(f"No analysis config")
4207                    raise ValueError(f"No analysis config")
4208
4209                # Log
4210                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4211
4212                ### PHENOPACKET ###
4213                ###################
4214
4215                # If no PhenoPacket in analysis dict -> check in param
4216                if "phenopacket" not in param_exomiser_analysis_dict:
4217
4218                    # If PhenoPacket in param -> load anlaysis json
4219                    if param_exomiser.get("phenopacket", None):
4220
4221                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4222                        param_exomiser_phenopacket = full_path(
4223                            param_exomiser_phenopacket
4224                        )
4225
4226                        # If param phenopacket is a file and exists
4227                        if isinstance(
4228                            param_exomiser_phenopacket, str
4229                        ) and os.path.exists(param_exomiser_phenopacket):
4230                            # Load phenopacket file into analysis dict (either yaml or json)
4231                            with open(param_exomiser_phenopacket) as json_file:
4232                                param_exomiser_analysis_dict["phenopacket"] = (
4233                                    yaml.safe_load(json_file)
4234                                )
4235
4236                        # If param phenopacket is a dict
4237                        elif isinstance(param_exomiser_phenopacket, dict):
4238                            # Load phenopacket dict into analysis dict (either yaml or json)
4239                            param_exomiser_analysis_dict["phenopacket"] = (
4240                                param_exomiser_phenopacket
4241                            )
4242
4243                        # Error phenopacket type
4244                        else:
4245                            log.error(f"Phenopacket type unknown. Check param file.")
4246                            raise ValueError(
4247                                f"Phenopacket type unknown. Check param file."
4248                            )
4249
4250                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4251                if "phenopacket" not in param_exomiser_analysis_dict:
4252
4253                    # Init PhenoPacket
4254                    param_exomiser_analysis_dict["phenopacket"] = {
4255                        "id": "analysis",
4256                        "proband": {},
4257                    }
4258
4259                    ### Add subject ###
4260
4261                    # If subject exists
4262                    param_exomiser_subject = param_exomiser.get("subject", {})
4263
4264                    # If subject not exists -> found sample ID
4265                    if not param_exomiser_subject:
4266
4267                        # Found sample ID in param
4268                        sample = param_exomiser.get("sample", None)
4269
4270                        # Find sample ID (first sample)
4271                        if not sample:
4272                            sample_list = self.get_header_sample_list()
4273                            if len(sample_list) > 0:
4274                                sample = sample_list[0]
4275                            else:
4276                                log.error(f"No sample found")
4277                                raise ValueError(f"No sample found")
4278
4279                        # Create subject
4280                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4281
4282                    # Add to dict
4283                    param_exomiser_analysis_dict["phenopacket"][
4284                        "subject"
4285                    ] = param_exomiser_subject
4286
4287                    ### Add "phenotypicFeatures" ###
4288
4289                    # If phenotypicFeatures exists
4290                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4291                        "phenotypicFeatures", []
4292                    )
4293
4294                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4295                    if not param_exomiser_phenotypicfeatures:
4296
4297                        # Found HPO in param
4298                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4299
4300                        # Split HPO if list in string format separated by comma
4301                        if isinstance(param_exomiser_hpo, str):
4302                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4303
4304                        # Create HPO list
4305                        for hpo in param_exomiser_hpo:
4306                            hpo_clean = re.sub("[^0-9]", "", hpo)
4307                            param_exomiser_phenotypicfeatures.append(
4308                                {
4309                                    "type": {
4310                                        "id": f"HP:{hpo_clean}",
4311                                        "label": f"HP:{hpo_clean}",
4312                                    }
4313                                }
4314                            )
4315
4316                    # Add to dict
4317                    param_exomiser_analysis_dict["phenopacket"][
4318                        "phenotypicFeatures"
4319                    ] = param_exomiser_phenotypicfeatures
4320
4321                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4322                    if not param_exomiser_phenotypicfeatures:
4323                        for step in param_exomiser_analysis_dict.get(
4324                            "analysis", {}
4325                        ).get("steps", []):
4326                            if "hiPhivePrioritiser" in step:
4327                                param_exomiser_analysis_dict.get("analysis", {}).get(
4328                                    "steps", []
4329                                ).remove(step)
4330
4331                ### Add Input File ###
4332
4333                # Initial file name and htsFiles
4334                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4335                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4336                    {
4337                        "uri": tmp_vcf_name,
4338                        "htsFormat": "VCF",
4339                        "genomeAssembly": assembly,
4340                    }
4341                ]
4342
4343                ### Add metaData ###
4344
4345                # If metaData not in analysis dict
4346                if "metaData" not in param_exomiser_analysis_dict:
4347                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4348                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4349                        "createdBy": "howard",
4350                        "phenopacketSchemaVersion": 1,
4351                    }
4352
4353                ### OutputOptions ###
4354
4355                # Init output result folder
4356                output_results = os.path.join(tmp_dir, "results")
4357
4358                # If no outputOptions in analysis dict
4359                if "outputOptions" not in param_exomiser_analysis_dict:
4360
4361                    # default output formats
4362                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4363
4364                    # Get outputOptions in param
4365                    output_options = param_exomiser.get("outputOptions", None)
4366
4367                    # If no output_options in param -> check
4368                    if not output_options:
4369                        output_options = {
4370                            "outputContributingVariantsOnly": False,
4371                            "numGenes": 0,
4372                            "outputFormats": defaut_output_formats,
4373                        }
4374
4375                    # Replace outputDirectory in output options
4376                    output_options["outputDirectory"] = output_results
4377                    output_options["outputFileName"] = "howard"
4378
4379                    # Add outputOptions in analysis dict
4380                    param_exomiser_analysis_dict["outputOptions"] = output_options
4381
4382                else:
4383
4384                    # Replace output_results and output format (if exists in param)
4385                    param_exomiser_analysis_dict["outputOptions"][
4386                        "outputDirectory"
4387                    ] = output_results
4388                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4389                        list(
4390                            set(
4391                                param_exomiser_analysis_dict.get(
4392                                    "outputOptions", {}
4393                                ).get("outputFormats", [])
4394                                + ["TSV_VARIANT", "VCF"]
4395                            )
4396                        )
4397                    )
4398
4399                # log
4400                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4401
4402                ### ANALYSIS FILE ###
4403                #####################
4404
4405                ### Full JSON analysis config file ###
4406
4407                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4408                with open(exomiser_analysis, "w") as fp:
4409                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4410
4411                ### SPLIT analysis and sample config files
4412
4413                # Splitted analysis dict
4414                param_exomiser_analysis_dict_for_split = (
4415                    param_exomiser_analysis_dict.copy()
4416                )
4417
4418                # Phenopacket JSON file
4419                exomiser_analysis_phenopacket = os.path.join(
4420                    tmp_dir, "analysis_phenopacket.json"
4421                )
4422                with open(exomiser_analysis_phenopacket, "w") as fp:
4423                    json.dump(
4424                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4425                        fp,
4426                        indent=4,
4427                    )
4428
4429                # Analysis JSON file without Phenopacket parameters
4430                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4431                exomiser_analysis_analysis = os.path.join(
4432                    tmp_dir, "analysis_analysis.json"
4433                )
4434                with open(exomiser_analysis_analysis, "w") as fp:
4435                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4436
                ### INITIAL VCF file ###
4438                #######################
4439
                ### Create list of samples to use and include into initial VCF file ####
4441
4442                # Subject (main sample)
4443                # Get sample ID in analysis dict
4444                sample_subject = (
4445                    param_exomiser_analysis_dict.get("phenopacket", {})
4446                    .get("subject", {})
4447                    .get("id", None)
4448                )
4449                sample_proband = (
4450                    param_exomiser_analysis_dict.get("phenopacket", {})
4451                    .get("proband", {})
4452                    .get("subject", {})
4453                    .get("id", None)
4454                )
4455                sample = []
4456                if sample_subject:
4457                    sample.append(sample_subject)
4458                if sample_proband:
4459                    sample.append(sample_proband)
4460
4461                # Get sample ID within Pedigree
4462                pedigree_persons_list = (
4463                    param_exomiser_analysis_dict.get("phenopacket", {})
4464                    .get("pedigree", {})
4465                    .get("persons", {})
4466                )
4467
4468                # Create list with all sample ID in pedigree (if exists)
4469                pedigree_persons = []
4470                for person in pedigree_persons_list:
4471                    pedigree_persons.append(person.get("individualId"))
4472
4473                # Concat subject sample ID and samples ID in pedigreesamples
4474                samples = list(set(sample + pedigree_persons))
4475
4476                # Check if sample list is not empty
4477                if not samples:
4478                    log.error(f"No samples found")
4479                    raise ValueError(f"No samples found")
4480
4481                # Create VCF with sample (either sample in param or first one by default)
4482                # Export VCF file
4483                self.export_variant_vcf(
4484                    vcf_file=tmp_vcf_name,
4485                    remove_info=True,
4486                    add_samples=True,
4487                    list_samples=samples,
4488                    index=False,
4489                )
4490
4491                ### Execute Exomiser ###
4492                ########################
4493
4494                # Init command
4495                exomiser_command = ""
4496
4497                # Command exomiser options
4498                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4499
4500                # Release
4501                exomiser_release = param_exomiser.get("release", None)
4502                if exomiser_release:
4503                    # phenotype data version
4504                    exomiser_options += (
4505                        f" --exomiser.phenotype.data-version={exomiser_release} "
4506                    )
4507                    # data version
4508                    exomiser_options += (
4509                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4510                    )
4511                    # variant white list
4512                    variant_white_list_file = (
4513                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4514                    )
4515                    if os.path.exists(
4516                        os.path.join(
4517                            databases_folders, assembly, variant_white_list_file
4518                        )
4519                    ):
4520                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4521
4522                # transcript_source
4523                transcript_source = param_exomiser.get(
4524                    "transcript_source", None
4525                )  # ucsc, refseq, ensembl
4526                if transcript_source:
4527                    exomiser_options += (
4528                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4529                    )
4530
4531                # If analysis contain proband param
4532                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4533                    "proband", {}
4534                ):
4535                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4536
4537                # If no proband (usually uniq sample)
4538                else:
4539                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4540
4541                # Log
4542                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4543
4544                # Run command
4545                result = subprocess.call(
4546                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4547                )
4548                if result:
4549                    log.error("Exomiser command failed")
4550                    raise ValueError("Exomiser command failed")
4551
4552                ### RESULTS ###
4553                ###############
4554
4555                ### Annotate with TSV fields ###
4556
4557                # Init result tsv file
4558                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4559
4560                # Init result tsv file
4561                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4562
4563                # Parse TSV file and explode columns in INFO field
4564                if exomiser_to_info and os.path.exists(output_results_tsv):
4565
4566                    # Log
4567                    log.debug("Exomiser columns to VCF INFO field")
4568
4569                    # Retrieve columns and types
4570                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4571                    output_results_tsv_df = self.get_query_to_df(query)
4572                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4573
4574                    # Init concat fields for update
4575                    sql_query_update_concat_fields = []
4576
4577                    # Fields to avoid
4578                    fields_to_avoid = [
4579                        "CONTIG",
4580                        "START",
4581                        "END",
4582                        "REF",
4583                        "ALT",
4584                        "QUAL",
4585                        "FILTER",
4586                        "GENOTYPE",
4587                    ]
4588
4589                    # List all columns to add into header
4590                    for header_column in output_results_tsv_columns:
4591
4592                        # If header column is enable
4593                        if header_column not in fields_to_avoid:
4594
4595                            # Header info type
4596                            header_info_type = "String"
4597                            header_column_df = output_results_tsv_df[header_column]
4598                            header_column_df_dtype = header_column_df.dtype
4599                            if header_column_df_dtype == object:
4600                                if (
4601                                    pd.to_numeric(header_column_df, errors="coerce")
4602                                    .notnull()
4603                                    .all()
4604                                ):
4605                                    header_info_type = "Float"
4606                            else:
4607                                header_info_type = "Integer"
4608
4609                            # Header info
4610                            characters_to_validate = ["-"]
4611                            pattern = "[" + "".join(characters_to_validate) + "]"
4612                            header_info_name = re.sub(
4613                                pattern,
4614                                "_",
4615                                f"Exomiser_{header_column}".replace("#", ""),
4616                            )
4617                            header_info_number = "."
4618                            header_info_description = (
4619                                f"Exomiser {header_column} annotation"
4620                            )
4621                            header_info_source = "Exomiser"
4622                            header_info_version = "unknown"
4623                            header_info_code = CODE_TYPE_MAP[header_info_type]
4624                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4625                                header_info_name,
4626                                header_info_number,
4627                                header_info_type,
4628                                header_info_description,
4629                                header_info_source,
4630                                header_info_version,
4631                                header_info_code,
4632                            )
4633
4634                            # Add field to add for update to concat fields
4635                            sql_query_update_concat_fields.append(
4636                                f"""
4637                                CASE
4638                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4639                                    THEN concat(
4640                                        '{header_info_name}=',
4641                                        table_parquet."{header_column}",
4642                                        ';'
4643                                        )
4644
4645                                    ELSE ''
4646                                END
4647                            """
4648                            )
4649
4650                    # Update query
4651                    sql_query_update = f"""
4652                        UPDATE {table_variants} as table_variants
4653                            SET INFO = concat(
4654                                            CASE
4655                                                WHEN INFO NOT IN ('', '.')
4656                                                THEN INFO
4657                                                ELSE ''
4658                                            END,
4659                                            CASE
4660                                                WHEN table_variants.INFO NOT IN ('','.')
4661                                                THEN ';'
4662                                                ELSE ''
4663                                            END,
4664                                            (
4665                                            SELECT 
4666                                                concat(
4667                                                    {",".join(sql_query_update_concat_fields)}
4668                                                )
4669                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4670                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4671                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4672                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4673                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4674                                            )
4675                                        )
4676                            ;
4677                        """
4678
4679                    # Update
4680                    self.conn.execute(sql_query_update)
4681
4682                ### Annotate with VCF INFO field ###
4683
4684                # Init result VCF file
4685                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4686
4687                # If VCF exists
4688                if os.path.exists(output_results_vcf):
4689
4690                    # Log
4691                    log.debug("Exomiser result VCF update variants")
4692
4693                    # Find Exomiser INFO field annotation in header
4694                    with gzip.open(output_results_vcf, "rt") as f:
4695                        header_list = self.read_vcf_header(f)
4696                    exomiser_vcf_header = vcf.Reader(
4697                        io.StringIO("\n".join(header_list))
4698                    )
4699
4700                    # Add annotation INFO field to header
4701                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4702
4703                    # Update variants with VCF
4704                    self.update_from_vcf(output_results_vcf)
4705
4706        return True
4707
4708    def annotation_snpeff(self, threads: int = None) -> None:
4709        """
4710        This function annotate with snpEff
4711
4712        :param threads: The number of threads to use
4713        :return: the value of the variable "return_value".
4714        """
4715
4716        # DEBUG
4717        log.debug("Start annotation with snpeff databases")
4718
4719        # Threads
4720        if not threads:
4721            threads = self.get_threads()
4722        log.debug("Threads: " + str(threads))
4723
4724        # DEBUG
4725        delete_tmp = True
4726        if self.get_config().get("verbosity", "warning") in ["debug"]:
4727            delete_tmp = False
4728            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4729
4730        # Config
4731        config = self.get_config()
4732        log.debug("Config: " + str(config))
4733
4734        # Config - Folders - Databases
4735        databases_folders = (
4736            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4737        )
4738        log.debug("Databases annotations: " + str(databases_folders))
4739
4740        # # Config - Java
4741        # java_bin = get_bin(
4742        #     tool="java",
4743        #     bin="java",
4744        #     bin_type="bin",
4745        #     config=config,
4746        #     default_folder="/usr/bin",
4747        # )
4748        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4749        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4750        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4751
4752        # # Config - snpEff bin
4753        # snpeff_jar = get_bin(
4754        #     tool="snpeff",
4755        #     bin="snpEff.jar",
4756        #     bin_type="jar",
4757        #     config=config,
4758        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4759        # )
4760        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4761        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4762        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763
4764        # Config - snpEff bin command
4765        snpeff_bin_command = get_bin_command(
4766            bin="snpEff.jar",
4767            tool="snpeff",
4768            bin_type="jar",
4769            config=config,
4770            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4771        )
4772        if not snpeff_bin_command:
4773            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4774            log.error(msg_err)
4775            raise ValueError(msg_err)
4776
4777        # Config - snpEff databases
4778        snpeff_databases = (
4779            config.get("folders", {})
4780            .get("databases", {})
4781            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4782        )
4783        snpeff_databases = full_path(snpeff_databases)
4784        if snpeff_databases is not None and snpeff_databases != "":
4785            log.debug(f"Create snpEff databases folder")
4786            if not os.path.exists(snpeff_databases):
4787                os.makedirs(snpeff_databases)
4788
4789        # Param
4790        param = self.get_param()
4791        log.debug("Param: " + str(param))
4792
4793        # Param
4794        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4795        log.debug("Options: " + str(options))
4796
4797        # Param - Assembly
4798        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4799
4800        # Param - Options
4801        snpeff_options = (
4802            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4803        )
4804        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4805        snpeff_csvstats = (
4806            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4807        )
4808        if snpeff_stats:
4809            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4810            snpeff_stats = full_path(snpeff_stats)
4811            snpeff_options += f" -stats {snpeff_stats}"
4812        if snpeff_csvstats:
4813            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4814            snpeff_csvstats = full_path(snpeff_csvstats)
4815            snpeff_options += f" -csvStats {snpeff_csvstats}"
4816
4817        # Data
4818        table_variants = self.get_table_variants()
4819
4820        # Check if not empty
4821        log.debug("Check if not empty")
4822        sql_query_chromosomes = (
4823            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4824        )
4825        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4826        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4827            log.info(f"VCF empty")
4828            return
4829
4830        # Export in VCF
4831        log.debug("Create initial file to annotate")
4832        tmp_vcf = NamedTemporaryFile(
4833            prefix=self.get_prefix(),
4834            dir=self.get_tmp_dir(),
4835            suffix=".vcf.gz",
4836            delete=True,
4837        )
4838        tmp_vcf_name = tmp_vcf.name
4839
4840        # VCF header
4841        vcf_reader = self.get_header()
4842        log.debug("Initial header: " + str(vcf_reader.infos))
4843
4844        # Existing annotations
4845        for vcf_annotation in self.get_header().infos:
4846
4847            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4848            log.debug(
4849                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4850            )
4851
4852        # Memory limit
4853        # if config.get("memory", None):
4854        #     memory_limit = config.get("memory", "8G")
4855        # else:
4856        #     memory_limit = "8G"
4857        memory_limit = self.get_memory("8G")
4858        log.debug(f"memory_limit: {memory_limit}")
4859
4860        # snpEff java options
4861        snpeff_java_options = (
4862            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4863        )
4864        log.debug(f"Exomiser java options: {snpeff_java_options}")
4865
4866        force_update_annotation = True
4867
4868        if "ANN" not in self.get_header().infos or force_update_annotation:
4869
4870            # Check snpEff database
4871            log.debug(f"Check snpEff databases {[assembly]}")
4872            databases_download_snpeff(
4873                folder=snpeff_databases, assemblies=[assembly], config=config
4874            )
4875
4876            # Export VCF file
4877            self.export_variant_vcf(
4878                vcf_file=tmp_vcf_name,
4879                remove_info=True,
4880                add_samples=False,
4881                index=True,
4882            )
4883
4884            # Tmp file
4885            err_files = []
4886            tmp_annotate_vcf = NamedTemporaryFile(
4887                prefix=self.get_prefix(),
4888                dir=self.get_tmp_dir(),
4889                suffix=".vcf",
4890                delete=False,
4891            )
4892            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4893            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4894            err_files.append(tmp_annotate_vcf_name_err)
4895
4896            # Command
4897            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4898            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4899            run_parallel_commands([snpeff_command], 1)
4900
4901            # Error messages
4902            log.info(f"Error/Warning messages:")
4903            error_message_command_all = []
4904            error_message_command_warning = []
4905            error_message_command_err = []
4906            for err_file in err_files:
4907                with open(err_file, "r") as f:
4908                    for line in f:
4909                        message = line.strip()
4910                        error_message_command_all.append(message)
4911                        if line.startswith("[W::"):
4912                            error_message_command_warning.append(message)
4913                        if line.startswith("[E::"):
4914                            error_message_command_err.append(f"{err_file}: " + message)
4915            # log info
4916            for message in list(
4917                set(error_message_command_err + error_message_command_warning)
4918            ):
4919                log.info(f"   {message}")
4920            # debug info
4921            for message in list(set(error_message_command_all)):
4922                log.debug(f"   {message}")
4923            # failed
4924            if len(error_message_command_err):
4925                log.error("Annotation failed: Error in commands")
4926                raise ValueError("Annotation failed: Error in commands")
4927
4928            # Find annotation in header
4929            with open(tmp_annotate_vcf_name, "rt") as f:
4930                header_list = self.read_vcf_header(f)
4931            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4932
4933            for ann in annovar_vcf_header.infos:
4934                if ann not in self.get_header().infos:
4935                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4936
4937            # Update variants
4938            log.info(f"Annotation - Updating...")
4939            self.update_from_vcf(tmp_annotate_vcf_name)
4940
4941        else:
4942            if "ANN" in self.get_header().infos:
4943                log.debug(f"Existing snpEff annotations in VCF")
4944            if force_update_annotation:
4945                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4946
4947    def annotation_annovar(self, threads: int = None) -> None:
4948        """
4949        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4950        annotations
4951
4952        :param threads: number of threads to use
4953        :return: the value of the variable "return_value".
4954        """
4955
4956        # DEBUG
4957        log.debug("Start annotation with Annovar databases")
4958
4959        # Threads
4960        if not threads:
4961            threads = self.get_threads()
4962        log.debug("Threads: " + str(threads))
4963
4964        # Tmp en Err files
4965        tmp_files = []
4966        err_files = []
4967
4968        # DEBUG
4969        delete_tmp = True
4970        if self.get_config().get("verbosity", "warning") in ["debug"]:
4971            delete_tmp = False
4972            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4973
4974        # Config
4975        config = self.get_config()
4976        log.debug("Config: " + str(config))
4977
4978        # Config - Folders - Databases
4979        databases_folders = (
4980            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4981        )
4982        log.debug("Databases annotations: " + str(databases_folders))
4983
4984        # Config - annovar bin command
4985        annovar_bin_command = get_bin_command(
4986            bin="table_annovar.pl",
4987            tool="annovar",
4988            bin_type="perl",
4989            config=config,
4990            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4991        )
4992        if not annovar_bin_command:
4993            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4994            log.error(msg_err)
4995            raise ValueError(msg_err)
4996
4997        # Config - BCFTools bin command
4998        bcftools_bin_command = get_bin_command(
4999            bin="bcftools",
5000            tool="bcftools",
5001            bin_type="bin",
5002            config=config,
5003            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5004        )
5005        if not bcftools_bin_command:
5006            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5007            log.error(msg_err)
5008            raise ValueError(msg_err)
5009
5010        # Config - annovar databases
5011        annovar_databases = (
5012            config.get("folders", {})
5013            .get("databases", {})
5014            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5015        )
5016        annovar_databases = full_path(annovar_databases)
5017        if annovar_databases != "" and not os.path.exists(annovar_databases):
5018            os.makedirs(annovar_databases)
5019
5020        # Param
5021        param = self.get_param()
5022        log.debug("Param: " + str(param))
5023
5024        # Param - options
5025        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5026        log.debug("Options: " + str(options))
5027
5028        # Param - annotations
5029        annotations = (
5030            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5031        )
5032        log.debug("Annotations: " + str(annotations))
5033
5034        # Param - Assembly
5035        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5036
5037        # Annovar database assembly
5038        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5039        if annovar_databases_assembly != "" and not os.path.exists(
5040            annovar_databases_assembly
5041        ):
5042            os.makedirs(annovar_databases_assembly)
5043
5044        # Data
5045        table_variants = self.get_table_variants()
5046
5047        # Check if not empty
5048        log.debug("Check if not empty")
5049        sql_query_chromosomes = (
5050            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5051        )
5052        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5053        if not sql_query_chromosomes_df["count"][0]:
5054            log.info(f"VCF empty")
5055            return
5056
5057        # VCF header
5058        vcf_reader = self.get_header()
5059        log.debug("Initial header: " + str(vcf_reader.infos))
5060
5061        # Existing annotations
5062        for vcf_annotation in self.get_header().infos:
5063
5064            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5065            log.debug(
5066                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5067            )
5068
5069        force_update_annotation = True
5070
5071        if annotations:
5072
5073            commands = []
5074            tmp_annotates_vcf_name_list = []
5075
5076            # Export in VCF
5077            log.debug("Create initial file to annotate")
5078            tmp_vcf = NamedTemporaryFile(
5079                prefix=self.get_prefix(),
5080                dir=self.get_tmp_dir(),
5081                suffix=".vcf.gz",
5082                delete=False,
5083            )
5084            tmp_vcf_name = tmp_vcf.name
5085            tmp_files.append(tmp_vcf_name)
5086            tmp_files.append(tmp_vcf_name + ".tbi")
5087
5088            # Export VCF file
5089            self.export_variant_vcf(
5090                vcf_file=tmp_vcf_name,
5091                remove_info=".",
5092                add_samples=False,
5093                index=True,
5094            )
5095
5096            # Create file for field rename
5097            log.debug("Create file for field rename")
5098            tmp_rename = NamedTemporaryFile(
5099                prefix=self.get_prefix(),
5100                dir=self.get_tmp_dir(),
5101                suffix=".rename",
5102                delete=False,
5103            )
5104            tmp_rename_name = tmp_rename.name
5105            tmp_files.append(tmp_rename_name)
5106
5107            # Check Annovar database
5108            log.debug(
5109                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5110            )
5111            databases_download_annovar(
5112                folder=annovar_databases,
5113                files=list(annotations.keys()),
5114                assemblies=[assembly],
5115            )
5116
5117            for annotation in annotations:
5118                annotation_fields = annotations[annotation]
5119
5120                if not annotation_fields:
5121                    annotation_fields = {"INFO": None}
5122
5123                log.info(f"Annotations Annovar - database '{annotation}'")
5124                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5125
5126                # Tmp file for annovar
5127                err_files = []
5128                tmp_annotate_vcf_directory = TemporaryDirectory(
5129                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5130                )
5131                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5132                tmp_annotate_vcf_name_annovar = (
5133                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5134                )
5135                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5136                err_files.append(tmp_annotate_vcf_name_err)
5137                tmp_files.append(tmp_annotate_vcf_name_err)
5138
5139                # Tmp file final vcf annotated by annovar
5140                tmp_annotate_vcf = NamedTemporaryFile(
5141                    prefix=self.get_prefix(),
5142                    dir=self.get_tmp_dir(),
5143                    suffix=".vcf.gz",
5144                    delete=False,
5145                )
5146                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5147                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5148                tmp_files.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5150
5151                # Number of fields
5152                annotation_list = []
5153                annotation_renamed_list = []
5154
5155                for annotation_field in annotation_fields:
5156
5157                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5158                    annotation_fields_new_name = annotation_fields.get(
5159                        annotation_field, annotation_field
5160                    )
5161                    if not annotation_fields_new_name:
5162                        annotation_fields_new_name = annotation_field
5163
5164                    if (
5165                        force_update_annotation
5166                        or annotation_fields_new_name not in self.get_header().infos
5167                    ):
5168                        annotation_list.append(annotation_field)
5169                        annotation_renamed_list.append(annotation_fields_new_name)
5170                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5171                        log.warning(
5172                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5173                        )
5174
5175                    # Add rename info
5176                    run_parallel_commands(
5177                        [
5178                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5179                        ],
5180                        1,
5181                    )
5182
5183                # log.debug("fields_to_removed: " + str(fields_to_removed))
5184                log.debug("annotation_list: " + str(annotation_list))
5185
5186                # protocol
5187                protocol = annotation
5188
5189                # argument
5190                argument = ""
5191
5192                # operation
5193                operation = "f"
5194                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5195                    "ensGene"
5196                ):
5197                    operation = "g"
5198                    if options.get("genebase", None):
5199                        argument = f"""'{options.get("genebase","")}'"""
5200                elif annotation in ["cytoBand"]:
5201                    operation = "r"
5202
5203                # argument option
5204                argument_option = ""
5205                if argument != "":
5206                    argument_option = " --argument " + argument
5207
5208                # command options
5209                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5210                for option in options:
5211                    if option not in ["genebase"]:
5212                        command_options += f""" --{option}={options[option]}"""
5213
5214                # Command
5215
5216                # Command - Annovar
5217                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5218                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5219
5220                # Command - start pipe
5221                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5222
5223                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5224                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5225
5226                # Command - Special characters (refGene annotation)
5227                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5228
5229                # Command - Clean empty fields (with value ".")
5230                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5231
5232                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5233                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5234                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5235                    # for ann in annotation_renamed_list:
5236                    for ann in annotation_list:
5237                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5238
5239                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5240
5241                # Command - indexing
5242                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5243
5244                log.debug(f"Annotation - Annovar command: {command_annovar}")
5245                run_parallel_commands([command_annovar], 1)
5246
5247                # Error messages
5248                log.info(f"Error/Warning messages:")
5249                error_message_command_all = []
5250                error_message_command_warning = []
5251                error_message_command_err = []
5252                for err_file in err_files:
5253                    with open(err_file, "r") as f:
5254                        for line in f:
5255                            message = line.strip()
5256                            error_message_command_all.append(message)
5257                            if line.startswith("[W::") or line.startswith("WARNING"):
5258                                error_message_command_warning.append(message)
5259                            if line.startswith("[E::") or line.startswith("ERROR"):
5260                                error_message_command_err.append(
5261                                    f"{err_file}: " + message
5262                                )
5263                # log info
5264                for message in list(
5265                    set(error_message_command_err + error_message_command_warning)
5266                ):
5267                    log.info(f"   {message}")
5268                # debug info
5269                for message in list(set(error_message_command_all)):
5270                    log.debug(f"   {message}")
5271                # failed
5272                if len(error_message_command_err):
5273                    log.error("Annotation failed: Error in commands")
5274                    raise ValueError("Annotation failed: Error in commands")
5275
5276            if tmp_annotates_vcf_name_list:
5277
5278                # List of annotated files
5279                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5280
5281                # Tmp file
5282                tmp_annotate_vcf = NamedTemporaryFile(
5283                    prefix=self.get_prefix(),
5284                    dir=self.get_tmp_dir(),
5285                    suffix=".vcf.gz",
5286                    delete=False,
5287                )
5288                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5289                tmp_files.append(tmp_annotate_vcf_name)
5290                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5291                err_files.append(tmp_annotate_vcf_name_err)
5292                tmp_files.append(tmp_annotate_vcf_name_err)
5293
5294                # Command merge
5295                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5296                log.info(
5297                    f"Annotation Annovar - Annotation merging "
5298                    + str(len(tmp_annotates_vcf_name_list))
5299                    + " annotated files"
5300                )
5301                log.debug(f"Annotation - merge command: {merge_command}")
5302                run_parallel_commands([merge_command], 1)
5303
5304                # Find annotation in header
5305                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5306                    header_list = self.read_vcf_header(f)
5307                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5308
5309                for ann in annovar_vcf_header.infos:
5310                    if ann not in self.get_header().infos:
5311                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5312
5313                # Update variants
5314                log.info(f"Annotation Annovar - Updating...")
5315                self.update_from_vcf(tmp_annotate_vcf_name)
5316
5317            # Clean files
5318            # Tmp file remove command
5319            if True:
5320                tmp_files_remove_command = ""
5321                if tmp_files:
5322                    tmp_files_remove_command = " ".join(tmp_files)
5323                clean_command = f" rm -f {tmp_files_remove_command} "
5324                log.debug(f"Annotation Annovar - Annotation cleaning ")
5325                log.debug(f"Annotation - cleaning command: {clean_command}")
5326                run_parallel_commands([clean_command], 1)
5327
5328    # Parquet
5329    def annotation_parquet(self, threads: int = None) -> None:
5330        """
5331        It takes a VCF file, and annotates it with a parquet file
5332
5333        :param threads: number of threads to use for the annotation
5334        :return: the value of the variable "result".
5335        """
5336
5337        # DEBUG
5338        log.debug("Start annotation with parquet databases")
5339
5340        # Threads
5341        if not threads:
5342            threads = self.get_threads()
5343        log.debug("Threads: " + str(threads))
5344
5345        # DEBUG
5346        delete_tmp = True
5347        if self.get_config().get("verbosity", "warning") in ["debug"]:
5348            delete_tmp = False
5349            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5350
5351        # Config
5352        databases_folders = set(
5353            self.get_config()
5354            .get("folders", {})
5355            .get("databases", {})
5356            .get("annotations", ["."])
5357            + self.get_config()
5358            .get("folders", {})
5359            .get("databases", {})
5360            .get("parquet", ["."])
5361        )
5362        log.debug("Databases annotations: " + str(databases_folders))
5363
5364        # Param
5365        annotations = (
5366            self.get_param()
5367            .get("annotation", {})
5368            .get("parquet", {})
5369            .get("annotations", None)
5370        )
5371        log.debug("Annotations: " + str(annotations))
5372
5373        # Assembly
5374        assembly = self.get_param().get(
5375            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5376        )
5377
5378        # Force Update Annotation
5379        force_update_annotation = (
5380            self.get_param()
5381            .get("annotation", {})
5382            .get("options", {})
5383            .get("annotations_update", False)
5384        )
5385        log.debug(f"force_update_annotation={force_update_annotation}")
5386        force_append_annotation = (
5387            self.get_param()
5388            .get("annotation", {})
5389            .get("options", {})
5390            .get("annotations_append", False)
5391        )
5392        log.debug(f"force_append_annotation={force_append_annotation}")
5393
5394        # Data
5395        table_variants = self.get_table_variants()
5396
5397        # Check if not empty
5398        log.debug("Check if not empty")
5399        sql_query_chromosomes_df = self.get_query_to_df(
5400            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5401        )
5402        if not sql_query_chromosomes_df["count"][0]:
5403            log.info(f"VCF empty")
5404            return
5405
5406        # VCF header
5407        vcf_reader = self.get_header()
5408        log.debug("Initial header: " + str(vcf_reader.infos))
5409
5410        # Nb Variants POS
5411        log.debug("NB Variants Start")
5412        nb_variants = self.conn.execute(
5413            f"SELECT count(*) AS count FROM variants"
5414        ).fetchdf()["count"][0]
5415        log.debug("NB Variants Stop")
5416
5417        # Existing annotations
5418        for vcf_annotation in self.get_header().infos:
5419
5420            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5421            log.debug(
5422                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5423            )
5424
5425        # Added columns
5426        added_columns = []
5427
5428        # drop indexes
5429        log.debug(f"Drop indexes...")
5430        self.drop_indexes()
5431
5432        if annotations:
5433
5434            if "ALL" in annotations:
5435
5436                all_param = annotations.get("ALL", {})
5437                all_param_formats = all_param.get("formats", None)
5438                all_param_releases = all_param.get("releases", None)
5439
5440                databases_infos_dict = self.scan_databases(
5441                    database_formats=all_param_formats,
5442                    database_releases=all_param_releases,
5443                )
5444                for database_infos in databases_infos_dict.keys():
5445                    if database_infos not in annotations:
5446                        annotations[database_infos] = {"INFO": None}
5447
5448            for annotation in annotations:
5449
5450                if annotation in ["ALL"]:
5451                    continue
5452
5453                # Annotation Name
5454                annotation_name = os.path.basename(annotation)
5455
5456                # Annotation fields
5457                annotation_fields = annotations[annotation]
5458                if not annotation_fields:
5459                    annotation_fields = {"INFO": None}
5460
5461                log.debug(f"Annotation '{annotation_name}'")
5462                log.debug(
5463                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5464                )
5465
5466                # Create Database
5467                database = Database(
5468                    database=annotation,
5469                    databases_folders=databases_folders,
5470                    assembly=assembly,
5471                )
5472
5473                # Find files
5474                parquet_file = database.get_database()
5475                parquet_hdr_file = database.get_header_file()
5476                parquet_type = database.get_type()
5477
5478                # Check if files exists
5479                if not parquet_file or not parquet_hdr_file:
5480                    log.error("Annotation failed: file not found")
5481                    raise ValueError("Annotation failed: file not found")
5482                else:
5483                    # Get parquet connexion
5484                    parquet_sql_attach = database.get_sql_database_attach(
5485                        output="query"
5486                    )
5487                    if parquet_sql_attach:
5488                        self.conn.execute(parquet_sql_attach)
5489                    parquet_file_link = database.get_sql_database_link()
5490                    # Log
5491                    log.debug(
5492                        f"Annotation '{annotation_name}' - file: "
5493                        + str(parquet_file)
5494                        + " and "
5495                        + str(parquet_hdr_file)
5496                    )
5497
5498                    # Database full header columns
5499                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5500                        parquet_hdr_file
5501                    )
5502                    # Log
5503                    log.debug(
5504                        "Annotation database header columns : "
5505                        + str(parquet_hdr_vcf_header_columns)
5506                    )
5507
5508                    # Load header as VCF object
5509                    parquet_hdr_vcf_header_infos = database.get_header().infos
5510                    # Log
5511                    log.debug(
5512                        "Annotation database header: "
5513                        + str(parquet_hdr_vcf_header_infos)
5514                    )
5515
5516                    # Get extra infos
5517                    parquet_columns = database.get_extra_columns()
5518                    # Log
5519                    log.debug("Annotation database Columns: " + str(parquet_columns))
5520
5521                    # Add extra columns if "ALL" in annotation_fields
5522                    # if "ALL" in annotation_fields:
5523                    #     allow_add_extra_column = True
5524                    if "ALL" in annotation_fields and database.get_extra_columns():
5525                        for extra_column in database.get_extra_columns():
5526                            if (
5527                                extra_column not in annotation_fields
5528                                and extra_column.replace("INFO/", "")
5529                                not in parquet_hdr_vcf_header_infos
5530                            ):
5531                                parquet_hdr_vcf_header_infos[extra_column] = (
5532                                    vcf.parser._Info(
5533                                        extra_column,
5534                                        ".",
5535                                        "String",
5536                                        f"{extra_column} description",
5537                                        "unknown",
5538                                        "unknown",
5539                                        self.code_type_map["String"],
5540                                    )
5541                                )
5542
5543                    # For all fields in database
5544                    annotation_fields_all = False
5545                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5546                        annotation_fields_all = True
5547                        annotation_fields = {
5548                            key: key for key in parquet_hdr_vcf_header_infos
5549                        }
5550
5551                        log.debug(
5552                            "Annotation database header - All annotations added: "
5553                            + str(annotation_fields)
5554                        )
5555
5556                    # Init
5557
5558                    # List of annotation fields to use
5559                    sql_query_annotation_update_info_sets = []
5560
5561                    # List of annotation to agregate
5562                    sql_query_annotation_to_agregate = []
5563
5564                    # Number of fields
5565                    nb_annotation_field = 0
5566
5567                    # Annotation fields processed
5568                    annotation_fields_processed = []
5569
5570                    # Columns mapping
5571                    map_columns = database.map_columns(
5572                        columns=annotation_fields, prefixes=["INFO/"]
5573                    )
5574
5575                    # Query dict for fields to remove (update option)
5576                    query_dict_remove = {}
5577
5578                    # Fetch Anotation fields
5579                    for annotation_field in annotation_fields:
5580
5581                        # annotation_field_column
5582                        annotation_field_column = map_columns.get(
5583                            annotation_field, "INFO"
5584                        )
5585
5586                        # field new name, if parametered
5587                        annotation_fields_new_name = annotation_fields.get(
5588                            annotation_field, annotation_field
5589                        )
5590                        if not annotation_fields_new_name:
5591                            annotation_fields_new_name = annotation_field
5592
5593                        # To annotate
5594                        # force_update_annotation = True
5595                        # force_append_annotation = True
5596                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5597                        if annotation_field in parquet_hdr_vcf_header_infos and (
5598                            force_update_annotation
5599                            or force_append_annotation
5600                            or (
5601                                annotation_fields_new_name
5602                                not in self.get_header().infos
5603                            )
5604                        ):
5605
5606                            # Add field to annotation to process list
5607                            annotation_fields_processed.append(
5608                                annotation_fields_new_name
5609                            )
5610
5611                            # explode infos for the field
5612                            annotation_fields_new_name_info_msg = ""
5613                            if (
5614                                force_update_annotation
5615                                and annotation_fields_new_name
5616                                in self.get_header().infos
5617                            ):
5618                                # Remove field from INFO
5619                                query = f"""
5620                                    UPDATE {table_variants} as table_variants
5621                                    SET INFO = REGEXP_REPLACE(
5622                                                concat(table_variants.INFO,''),
5623                                                ';*{annotation_fields_new_name}=[^;]*',
5624                                                ''
5625                                                )
5626                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5627                                """
5628                                annotation_fields_new_name_info_msg = " [update]"
5629                                query_dict_remove[
5630                                    f"remove 'INFO/{annotation_fields_new_name}'"
5631                                ] = query
5632
5633                            # Sep between fields in INFO
5634                            nb_annotation_field += 1
5635                            if nb_annotation_field > 1:
5636                                annotation_field_sep = ";"
5637                            else:
5638                                annotation_field_sep = ""
5639
5640                            log.info(
5641                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5642                            )
5643
5644                            # Add INFO field to header
5645                            parquet_hdr_vcf_header_infos_number = (
5646                                parquet_hdr_vcf_header_infos[annotation_field].num
5647                                or "."
5648                            )
5649                            parquet_hdr_vcf_header_infos_type = (
5650                                parquet_hdr_vcf_header_infos[annotation_field].type
5651                                or "String"
5652                            )
5653                            parquet_hdr_vcf_header_infos_description = (
5654                                parquet_hdr_vcf_header_infos[annotation_field].desc
5655                                or f"{annotation_field} description"
5656                            )
5657                            parquet_hdr_vcf_header_infos_source = (
5658                                parquet_hdr_vcf_header_infos[annotation_field].source
5659                                or "unknown"
5660                            )
5661                            parquet_hdr_vcf_header_infos_version = (
5662                                parquet_hdr_vcf_header_infos[annotation_field].version
5663                                or "unknown"
5664                            )
5665
5666                            vcf_reader.infos[annotation_fields_new_name] = (
5667                                vcf.parser._Info(
5668                                    annotation_fields_new_name,
5669                                    parquet_hdr_vcf_header_infos_number,
5670                                    parquet_hdr_vcf_header_infos_type,
5671                                    parquet_hdr_vcf_header_infos_description,
5672                                    parquet_hdr_vcf_header_infos_source,
5673                                    parquet_hdr_vcf_header_infos_version,
5674                                    self.code_type_map[
5675                                        parquet_hdr_vcf_header_infos_type
5676                                    ],
5677                                )
5678                            )
5679
5680                            # Append
5681                            if force_append_annotation:
5682                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5683                            else:
5684                                query_case_when_append = ""
5685
5686                            # Annotation/Update query fields
5687                            # Found in INFO column
5688                            if (
5689                                annotation_field_column == "INFO"
5690                                and "INFO" in parquet_hdr_vcf_header_columns
5691                            ):
5692                                sql_query_annotation_update_info_sets.append(
5693                                    f"""
5694                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5695                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5696                                        ELSE ''
5697                                    END
5698                                """
5699                                )
5700                            # Found in a specific column
5701                            else:
5702                                sql_query_annotation_update_info_sets.append(
5703                                    f"""
5704                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5705                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5706                                        ELSE ''
5707                                    END
5708                                """
5709                                )
5710                                sql_query_annotation_to_agregate.append(
5711                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5712                                )
5713
5714                        # Not to annotate
5715                        else:
5716
5717                            if force_update_annotation:
5718                                annotation_message = "forced"
5719                            else:
5720                                annotation_message = "skipped"
5721
5722                            if annotation_field not in parquet_hdr_vcf_header_infos:
5723                                log.warning(
5724                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5725                                )
5726                            if annotation_fields_new_name in self.get_header().infos:
5727                                log.warning(
5728                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5729                                )
5730
5731                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5732                    # allow_annotation_full_info = True
5733                    allow_annotation_full_info = not force_append_annotation
5734
5735                    if parquet_type in ["regions"]:
5736                        allow_annotation_full_info = False
5737
5738                    if (
5739                        allow_annotation_full_info
5740                        and nb_annotation_field == len(annotation_fields)
5741                        and annotation_fields_all
5742                        and (
5743                            "INFO" in parquet_hdr_vcf_header_columns
5744                            and "INFO" in database.get_extra_columns()
5745                        )
5746                    ):
5747                        log.debug("Column INFO annotation enabled")
5748                        sql_query_annotation_update_info_sets = []
5749                        sql_query_annotation_update_info_sets.append(
5750                            f" table_parquet.INFO "
5751                        )
5752
5753                    if sql_query_annotation_update_info_sets:
5754
5755                        # Annotate
5756                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5757
5758                        # Join query annotation update info sets for SQL
5759                        sql_query_annotation_update_info_sets_sql = ",".join(
5760                            sql_query_annotation_update_info_sets
5761                        )
5762
5763                        # Check chromosomes list (and variants infos)
5764                        sql_query_chromosomes = f"""
5765                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5766                            FROM {table_variants} as table_variants
5767                            GROUP BY table_variants."#CHROM"
5768                            ORDER BY table_variants."#CHROM"
5769                            """
5770                        sql_query_chromosomes_df = self.conn.execute(
5771                            sql_query_chromosomes
5772                        ).df()
5773                        sql_query_chromosomes_dict = {
5774                            entry["CHROM"]: {
5775                                "count": entry["count_variants"],
5776                                "min": entry["min_variants"],
5777                                "max": entry["max_variants"],
5778                            }
5779                            for index, entry in sql_query_chromosomes_df.iterrows()
5780                        }
5781
5782                        # Init
5783                        nb_of_query = 0
5784                        nb_of_variant_annotated = 0
5785                        query_dict = query_dict_remove
5786
5787                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5788                        for chrom in sql_query_chromosomes_dict:
5789
5790                            # Number of variant by chromosome
5791                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5792                                chrom, {}
5793                            ).get("count", 0)
5794
5795                            log.debug(
5796                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5797                            )
5798
5799                            # Annotation with regions database
5800                            if parquet_type in ["regions"]:
5801                                sql_query_annotation_from_clause = f"""
5802                                    FROM (
5803                                        SELECT 
5804                                            '{chrom}' AS \"#CHROM\",
5805                                            table_variants_from.\"POS\" AS \"POS\",
5806                                            {",".join(sql_query_annotation_to_agregate)}
5807                                        FROM {table_variants} as table_variants_from
5808                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5809                                            table_parquet_from."#CHROM" = '{chrom}'
5810                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5811                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5812                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5813                                                )
5814                                        )
5815                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5816                                        GROUP BY table_variants_from.\"POS\"
5817                                        )
5818                                        as table_parquet
5819                                """
5820
5821                                sql_query_annotation_where_clause = """
5822                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5823                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5824                                """
5825
5826                            # Annotation with variants database
5827                            else:
5828                                sql_query_annotation_from_clause = f"""
5829                                    FROM {parquet_file_link} as table_parquet
5830                                """
5831                                sql_query_annotation_where_clause = f"""
5832                                    table_variants."#CHROM" = '{chrom}'
5833                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5834                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5835                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5836                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5837                                """
5838
5839                            # Create update query
5840                            sql_query_annotation_chrom_interval_pos = f"""
5841                                UPDATE {table_variants} as table_variants
5842                                    SET INFO = 
5843                                        concat(
5844                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5845                                                THEN table_variants.INFO
5846                                                ELSE ''
5847                                            END
5848                                            ,
5849                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5850                                                        AND (
5851                                                        concat({sql_query_annotation_update_info_sets_sql})
5852                                                        )
5853                                                        NOT IN ('','.') 
5854                                                    THEN ';'
5855                                                    ELSE ''
5856                                            END
5857                                            ,
5858                                            {sql_query_annotation_update_info_sets_sql}
5859                                            )
5860                                    {sql_query_annotation_from_clause}
5861                                    WHERE {sql_query_annotation_where_clause}
5862                                    ;
5863                                """
5864
5865                            # Add update query to dict
5866                            query_dict[
5867                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5868                            ] = sql_query_annotation_chrom_interval_pos
5869
5870                        nb_of_query = len(query_dict)
5871                        num_query = 0
5872
5873                        # SET max_expression_depth TO x
5874                        self.conn.execute("SET max_expression_depth TO 10000")
5875
5876                        for query_name in query_dict:
5877                            query = query_dict[query_name]
5878                            num_query += 1
5879                            log.info(
5880                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5881                            )
5882                            result = self.conn.execute(query)
5883                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5884                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5885                            log.info(
5886                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5887                            )
5888
5889                        log.info(
5890                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5891                        )
5892
5893                    else:
5894
5895                        log.info(
5896                            f"Annotation '{annotation_name}' - No Annotations available"
5897                        )
5898
5899                    log.debug("Final header: " + str(vcf_reader.infos))
5900
5901        # Remove added columns
5902        for added_column in added_columns:
5903            self.drop_column(column=added_column)
5904
5905    def annotation_splice(self, threads: int = None) -> None:
5906        """
5907        This function annotate with snpEff
5908
5909        :param threads: The number of threads to use
5910        :return: the value of the variable "return_value".
5911        """
5912
5913        # DEBUG
5914        log.debug("Start annotation with splice tools")
5915
5916        # Threads
5917        if not threads:
5918            threads = self.get_threads()
5919        log.debug("Threads: " + str(threads))
5920
5921        # DEBUG
5922        delete_tmp = True
5923        if self.get_config().get("verbosity", "warning") in ["debug"]:
5924            delete_tmp = False
5925            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5926
5927        # Config
5928        config = self.get_config()
5929        log.debug("Config: " + str(config))
5930        splice_config = config.get("tools", {}).get("splice", {})
5931        if not splice_config:
5932            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5933        if not splice_config:
5934            msg_err = "No Splice tool config"
5935            log.error(msg_err)
5936            raise ValueError(msg_err)
5937        log.debug(f"splice_config={splice_config}")
5938
5939        # Config - Folders - Databases
5940        databases_folders = (
5941            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5942        )
5943        log.debug("Databases annotations: " + str(databases_folders))
5944
5945        # Splice docker image
5946        splice_docker_image = splice_config.get("docker").get("image")
5947
5948        # Pull splice image if it's not already there
5949        if not check_docker_image_exists(splice_docker_image):
5950            log.warning(
5951                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5952            )
5953            try:
5954                command(f"docker pull {splice_config.get('docker').get('image')}")
5955            except subprocess.CalledProcessError:
5956                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5957                log.error(msg_err)
5958                raise ValueError(msg_err)
5959                return None
5960
5961        # Config - splice databases
5962        splice_databases = (
5963            config.get("folders", {})
5964            .get("databases", {})
5965            .get("splice", DEFAULT_SPLICE_FOLDER)
5966        )
5967        splice_databases = full_path(splice_databases)
5968
5969        # Param
5970        param = self.get_param()
5971        log.debug("Param: " + str(param))
5972
5973        # Param
5974        options = param.get("annotation", {}).get("splice", {})
5975        log.debug("Options: " + str(options))
5976
5977        # Data
5978        table_variants = self.get_table_variants()
5979
5980        # Check if not empty
5981        log.debug("Check if not empty")
5982        sql_query_chromosomes = (
5983            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5984        )
5985        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5986            log.info("VCF empty")
5987            return None
5988
5989        # Export in VCF
5990        log.debug("Create initial file to annotate")
5991
5992        # Create output folder
5993        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5994        if not os.path.exists(output_folder):
5995            Path(output_folder).mkdir(parents=True, exist_ok=True)
5996
5997        # Create tmp VCF file
5998        tmp_vcf = NamedTemporaryFile(
5999            prefix=self.get_prefix(),
6000            dir=output_folder,
6001            suffix=".vcf",
6002            delete=False,
6003        )
6004        tmp_vcf_name = tmp_vcf.name
6005
6006        # VCF header
6007        header = self.get_header()
6008
6009        # Existing annotations
6010        for vcf_annotation in self.get_header().infos:
6011
6012            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6013            log.debug(
6014                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6015            )
6016
6017        # Memory limit
6018        if config.get("memory", None):
6019            memory_limit = config.get("memory", "8G").upper()
6020            # upper()
6021        else:
6022            memory_limit = "8G"
6023        log.debug(f"memory_limit: {memory_limit}")
6024
6025        # Check number of variants to annotate
6026        where_clause_regex_spliceai = r"SpliceAI_\w+"
6027        where_clause_regex_spip = r"SPiP_\w+"
6028        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6029        df_list_of_variants_to_annotate = self.get_query_to_df(
6030            query=f""" SELECT * FROM variants {where_clause} """
6031        )
6032        if len(df_list_of_variants_to_annotate) == 0:
6033            log.warning(
6034                f"No variants to annotate with splice. Variants probably already annotated with splice"
6035            )
6036            return None
6037        else:
6038            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6039
6040        # Export VCF file
6041        self.export_variant_vcf(
6042            vcf_file=tmp_vcf_name,
6043            remove_info=True,
6044            add_samples=True,
6045            index=False,
6046            where_clause=where_clause,
6047        )
6048
6049        # Create docker container and launch splice analysis
6050        if splice_config:
6051
6052            # Splice mount folders
6053            mount_folders = splice_config.get("mount", {})
6054
6055            # Genome mount
6056            mount_folders[
6057                config.get("folders", {})
6058                .get("databases", {})
6059                .get("genomes", DEFAULT_GENOME_FOLDER)
6060            ] = "ro"
6061
6062            # SpliceAI mount
6063            mount_folders[
6064                config.get("folders", {})
6065                .get("databases", {})
6066                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6067            ] = "ro"
6068
6069            # Genome mount
6070            mount_folders[
6071                config.get("folders", {})
6072                .get("databases", {})
6073                .get("spip", DEFAULT_SPIP_FOLDER)
6074            ] = "ro"
6075
6076            # Mount folders
6077            mount = []
6078
6079            # Config mount
6080            mount = [
6081                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6082                for path, mode in mount_folders.items()
6083            ]
6084
6085            if any(value for value in splice_config.values() if value is None):
6086                log.warning("At least one splice config parameter is empty")
6087                return None
6088
6089            # Params in splice nf
6090            def check_values(dico: dict):
6091                """
6092                Ensure parameters for NF splice pipeline
6093                """
6094                for key, val in dico.items():
6095                    if key == "genome":
6096                        if any(
6097                            assemb in options.get("genome", {})
6098                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6099                        ):
6100                            yield f"--{key} hg19"
6101                        elif any(
6102                            assemb in options.get("genome", {})
6103                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6104                        ):
6105                            yield f"--{key} hg38"
6106                    elif (
6107                        (isinstance(val, str) and val)
6108                        or isinstance(val, int)
6109                        or isinstance(val, bool)
6110                    ):
6111                        yield f"--{key} {val}"
6112
6113            # Genome
6114            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6115            options["genome"] = genome
6116
6117            # NF params
6118            nf_params = []
6119
6120            # Add options
6121            if options:
6122                nf_params = list(check_values(options))
6123                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6124            else:
6125                log.debug("No NF params provided")
6126
6127            # Add threads
6128            if "threads" not in options.keys():
6129                nf_params.append(f"--threads {threads}")
6130
6131            # Genome path
6132            genome_path = find_genome(
6133                config.get("folders", {})
6134                .get("databases", {})
6135                .get("genomes", DEFAULT_GENOME_FOLDER),
6136                file=f"{genome}.fa",
6137            )
6138            # Add genome path
6139            if not genome_path:
6140                raise ValueError(
6141                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6142                )
6143            else:
6144                log.debug(f"Genome: {genome_path}")
6145                nf_params.append(f"--genome_path {genome_path}")
6146
6147            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6148                """
6149                Setting up updated databases for SPiP and SpliceAI
6150                """
6151
6152                try:
6153
6154                    # SpliceAI assembly transcriptome
6155                    spliceai_assembly = os.path.join(
6156                        config.get("folders", {})
6157                        .get("databases", {})
6158                        .get("spliceai", {}),
6159                        options.get("genome"),
6160                        "transcriptome",
6161                    )
6162                    spip_assembly = options.get("genome")
6163
6164                    spip = find(
6165                        f"transcriptome_{spip_assembly}.RData",
6166                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6167                    )
6168                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6169                    log.debug(f"SPiP annotations: {spip}")
6170                    log.debug(f"SpliceAI annotations: {spliceai}")
6171                    if spip and spliceai:
6172                        return [
6173                            f"--spip_transcriptome {spip}",
6174                            f"--spliceai_annotations {spliceai}",
6175                        ]
6176                    else:
6177                        # TODO crash and go on with basic annotations ?
6178                        # raise ValueError(
6179                        #     "Can't find splice databases in configuration EXIT"
6180                        # )
6181                        log.warning(
6182                            "Can't find splice databases in configuration, use annotations file from image"
6183                        )
6184                except TypeError:
6185                    log.warning(
6186                        "Can't find splice databases in configuration, use annotations file from image"
6187                    )
6188                    return []
6189
6190            # Add options, check if transcriptome option have already beend provided
6191            if (
6192                "spip_transcriptome" not in nf_params
6193                and "spliceai_transcriptome" not in nf_params
6194            ):
6195                splice_reference = splice_annotations(options, config)
6196                if splice_reference:
6197                    nf_params.extend(splice_reference)
6198
6199            nf_params.append(f"--output_folder {output_folder}")
6200
6201            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6202            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6203            log.debug(cmd)
6204
6205            splice_config["docker"]["command"] = cmd
6206
6207            docker_cmd = get_bin_command(
6208                tool="splice",
6209                bin_type="docker",
6210                config=config,
6211                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6212                add_options=f"--name {random_uuid} {' '.join(mount)}",
6213            )
6214
6215            # Docker debug
6216            # if splice_config.get("rm_container"):
6217            #     rm_container = "--rm"
6218            # else:
6219            #     rm_container = ""
6220            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6221
6222            log.debug(docker_cmd)
6223            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6224            log.debug(res.stdout)
6225            if res.stderr:
6226                log.error(res.stderr)
6227            res.check_returncode()
6228        else:
6229            log.warning(f"Splice tool configuration not found: {config}")
6230
6231        # Update variants
6232        log.info("Annotation - Updating...")
6233        # Test find output vcf
6234        log.debug(
6235            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6236        )
6237        output_vcf = []
6238        # Wrong folder to look in
6239        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6240            if (
6241                files
6242                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6243            ):
6244                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6245        # log.debug(os.listdir(options.get("output_folder")))
6246        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6247        if not output_vcf:
6248            log.debug(
6249                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6250            )
6251        else:
6252            # Get new header from annotated vcf
6253            log.debug(f"Initial header: {len(header.infos)} fields")
6254            # Create new header with splice infos
6255            new_vcf = Variants(input=output_vcf[0])
6256            new_vcf_header = new_vcf.get_header().infos
6257            for keys, infos in new_vcf_header.items():
6258                if keys not in header.infos.keys():
6259                    header.infos[keys] = infos
6260            log.debug(f"New header: {len(header.infos)} fields")
6261            log.debug(f"Splice tmp output: {output_vcf[0]}")
6262            self.update_from_vcf(output_vcf[0])
6263
6264        # Remove folder
6265        remove_if_exists(output_folder)
6266
6267    ###
6268    # Prioritization
6269    ###
6270
6271    def get_config_default(self, name: str) -> dict:
6272        """
6273        The function `get_config_default` returns a dictionary containing default configurations for
6274        various calculations and prioritizations.
6275
6276        :param name: The `get_config_default` function returns a dictionary containing default
6277        configurations for different calculations and prioritizations. The `name` parameter is used to
6278        specify which specific configuration to retrieve from the dictionary
6279        :type name: str
6280        :return: The function `get_config_default` returns a dictionary containing default configuration
6281        settings for different calculations and prioritizations. The specific configuration settings are
6282        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6283        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6284        returned. If there is no match, an empty dictionary is returned.
6285        """
6286
6287        config_default = {
6288            "calculations": {
6289                "variant_chr_pos_alt_ref": {
6290                    "type": "sql",
6291                    "name": "variant_chr_pos_alt_ref",
6292                    "description": "Create a variant ID with chromosome, position, alt and ref",
6293                    "available": False,
6294                    "output_column_name": "variant_chr_pos_alt_ref",
6295                    "output_column_type": "String",
6296                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6297                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6298                    "operation_info": True,
6299                },
6300                "VARTYPE": {
6301                    "type": "sql",
6302                    "name": "VARTYPE",
6303                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6304                    "available": True,
6305                    "output_column_name": "VARTYPE",
6306                    "output_column_type": "String",
6307                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6308                    "operation_query": """
6309                            CASE
6310                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6311                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6312                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6313                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6314                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6315                                ELSE 'UNDEFINED'
6316                            END
6317                            """,
6318                    "info_fields": ["SVTYPE"],
6319                    "operation_info": True,
6320                },
6321                "snpeff_hgvs": {
6322                    "type": "python",
6323                    "name": "snpeff_hgvs",
6324                    "description": "HGVS nomenclatures from snpEff annotation",
6325                    "available": True,
6326                    "function_name": "calculation_extract_snpeff_hgvs",
6327                    "function_params": ["snpeff_hgvs", "ANN"],
6328                },
6329                "snpeff_ann_explode": {
6330                    "type": "python",
6331                    "name": "snpeff_ann_explode",
6332                    "description": "Explode snpEff annotations with uniquify values",
6333                    "available": True,
6334                    "function_name": "calculation_snpeff_ann_explode",
6335                    "function_params": [False, "fields", "snpeff_", "ANN"],
6336                },
6337                "snpeff_ann_explode_uniquify": {
6338                    "type": "python",
6339                    "name": "snpeff_ann_explode_uniquify",
6340                    "description": "Explode snpEff annotations",
6341                    "available": True,
6342                    "function_name": "calculation_snpeff_ann_explode",
6343                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6344                },
6345                "snpeff_ann_explode_json": {
6346                    "type": "python",
6347                    "name": "snpeff_ann_explode_json",
6348                    "description": "Explode snpEff annotations in JSON format",
6349                    "available": True,
6350                    "function_name": "calculation_snpeff_ann_explode",
6351                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6352                },
6353                "NOMEN": {
6354                    "type": "python",
6355                    "name": "NOMEN",
6356                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6357                    "available": True,
6358                    "function_name": "calculation_extract_nomen",
6359                    "function_params": [],
6360                },
6361                "FINDBYPIPELINE": {
6362                    "type": "python",
6363                    "name": "FINDBYPIPELINE",
6364                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6365                    "available": True,
6366                    "function_name": "calculation_find_by_pipeline",
6367                    "function_params": ["findbypipeline"],
6368                },
6369                "FINDBYSAMPLE": {
6370                    "type": "python",
6371                    "name": "FINDBYSAMPLE",
6372                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6373                    "available": True,
6374                    "function_name": "calculation_find_by_pipeline",
6375                    "function_params": ["findbysample"],
6376                },
6377                "GENOTYPECONCORDANCE": {
6378                    "type": "python",
6379                    "name": "GENOTYPECONCORDANCE",
6380                    "description": "Concordance of genotype for multi caller VCF",
6381                    "available": True,
6382                    "function_name": "calculation_genotype_concordance",
6383                    "function_params": [],
6384                },
6385                "BARCODE": {
6386                    "type": "python",
6387                    "name": "BARCODE",
6388                    "description": "BARCODE as VaRank tool",
6389                    "available": True,
6390                    "function_name": "calculation_barcode",
6391                    "function_params": [],
6392                },
6393                "BARCODEFAMILY": {
6394                    "type": "python",
6395                    "name": "BARCODEFAMILY",
6396                    "description": "BARCODEFAMILY as VaRank tool",
6397                    "available": True,
6398                    "function_name": "calculation_barcode_family",
6399                    "function_params": ["BCF"],
6400                },
6401                "TRIO": {
6402                    "type": "python",
6403                    "name": "TRIO",
6404                    "description": "Inheritance for a trio family",
6405                    "available": True,
6406                    "function_name": "calculation_trio",
6407                    "function_params": [],
6408                },
6409                "VAF": {
6410                    "type": "python",
6411                    "name": "VAF",
6412                    "description": "Variant Allele Frequency (VAF) harmonization",
6413                    "available": True,
6414                    "function_name": "calculation_vaf_normalization",
6415                    "function_params": [],
6416                },
6417                "VAF_stats": {
6418                    "type": "python",
6419                    "name": "VAF_stats",
6420                    "description": "Variant Allele Frequency (VAF) statistics",
6421                    "available": True,
6422                    "function_name": "calculation_genotype_stats",
6423                    "function_params": ["VAF"],
6424                },
6425                "DP_stats": {
6426                    "type": "python",
6427                    "name": "DP_stats",
6428                    "description": "Depth (DP) statistics",
6429                    "available": True,
6430                    "function_name": "calculation_genotype_stats",
6431                    "function_params": ["DP"],
6432                },
6433                "variant_id": {
6434                    "type": "python",
6435                    "name": "variant_id",
6436                    "description": "Variant ID generated from variant position and type",
6437                    "available": True,
6438                    "function_name": "calculation_variant_id",
6439                    "function_params": [],
6440                },
6441                "transcripts_json": {
6442                    "type": "python",
6443                    "name": "transcripts_json",
6444                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
6445                    "available": True,
6446                    "function_name": "calculation_transcripts_json",
6447                    "function_params": ["transcripts_json"],
6448                },
6449            },
6450            "prioritizations": {
6451                "default": {
6452                    "filter": [
6453                        {
6454                            "type": "notequals",
6455                            "value": "!PASS|\\.",
6456                            "score": 0,
6457                            "flag": "FILTERED",
6458                            "comment": ["Bad variant quality"],
6459                        },
6460                        {
6461                            "type": "equals",
6462                            "value": "REJECT",
6463                            "score": -20,
6464                            "flag": "PASS",
6465                            "comment": ["Bad variant quality"],
6466                        },
6467                    ],
6468                    "DP": [
6469                        {
6470                            "type": "gte",
6471                            "value": "50",
6472                            "score": 5,
6473                            "flag": "PASS",
6474                            "comment": ["DP higher than 50"],
6475                        }
6476                    ],
6477                    "ANN": [
6478                        {
6479                            "type": "contains",
6480                            "value": "HIGH",
6481                            "score": 5,
6482                            "flag": "PASS",
6483                            "comment": [
6484                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6485                            ],
6486                        },
6487                        {
6488                            "type": "contains",
6489                            "value": "MODERATE",
6490                            "score": 3,
6491                            "flag": "PASS",
6492                            "comment": [
6493                                "A non-disruptive variant that might change protein effectiveness"
6494                            ],
6495                        },
6496                        {
6497                            "type": "contains",
6498                            "value": "LOW",
6499                            "score": 0,
6500                            "flag": "FILTERED",
6501                            "comment": [
6502                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6503                            ],
6504                        },
6505                        {
6506                            "type": "contains",
6507                            "value": "MODIFIER",
6508                            "score": 0,
6509                            "flag": "FILTERED",
6510                            "comment": [
6511                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6512                            ],
6513                        },
6514                    ],
6515                }
6516            },
6517        }
6518
6519        return config_default.get(name, None)
6520
6521    def get_config_json(
6522        self, name: str, config_dict: dict = {}, config_file: str = None
6523    ) -> dict:
6524        """
6525        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6526        default values, a dictionary, and a file.
6527
6528        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6529        the name of the configuration. It is used to identify and retrieve the configuration settings
6530        for a specific component or module
6531        :type name: str
6532        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6533        dictionary that allows you to provide additional configuration settings or overrides. When you
6534        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6535        the key is the configuration setting you want to override or
6536        :type config_dict: dict
6537        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6538        specify the path to a configuration file that contains additional settings. If provided, the
6539        function will read the contents of this file and update the configuration dictionary with the
6540        values found in the file, overriding any existing values with the
6541        :type config_file: str
6542        :return: The function `get_config_json` returns a dictionary containing the configuration
6543        settings.
6544        """
6545
6546        # Create with default prioritizations
6547        config_default = self.get_config_default(name=name)
6548        configuration = config_default
6549        # log.debug(f"configuration={configuration}")
6550
6551        # Replace prioritizations from dict
6552        for config in config_dict:
6553            configuration[config] = config_dict[config]
6554
6555        # Replace prioritizations from file
6556        config_file = full_path(config_file)
6557        if config_file:
6558            if os.path.exists(config_file):
6559                with open(config_file) as config_file_content:
6560                    config_file_dict = json.load(config_file_content)
6561                for config in config_file_dict:
6562                    configuration[config] = config_file_dict[config]
6563            else:
6564                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6565                log.error(msg_error)
6566                raise ValueError(msg_error)
6567
6568        return configuration
6569
6570    def prioritization(self) -> None:
6571        """
6572        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6573        INFO fields
6574        """
6575
6576        # Config
6577        config = self.get_config()
6578
6579        # Param
6580        param = self.get_param()
6581
6582        # Quick Prioritizations
6583        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6584
6585        # Configuration profiles
6586        prioritization_config_file = param.get("prioritization", {}).get(
6587            "prioritization_config", None
6588        )
6589        prioritization_config_file = full_path(prioritization_config_file)
6590        prioritizations_config = self.get_config_json(
6591            name="prioritizations", config_file=prioritization_config_file
6592        )
6593
6594        # Prioritization options
6595        profiles = param.get("prioritization", {}).get("profiles", [])
6596        if isinstance(profiles, str):
6597            profiles = profiles.split(",")
6598        pzfields = param.get("prioritization", {}).get(
6599            "pzfields", ["PZFlag", "PZScore"]
6600        )
6601        if isinstance(pzfields, str):
6602            pzfields = pzfields.split(",")
6603        default_profile = param.get("prioritization", {}).get("default_profile", None)
6604        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6605        prioritization_score_mode = param.get("prioritization", {}).get(
6606            "prioritization_score_mode", "HOWARD"
6607        )
6608
6609        # Quick Prioritizations
6610        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6611        prioritizations = param.get("prioritizations", None)
6612        if prioritizations:
6613            log.info("Quick Prioritization:")
6614            for profile in prioritizations.split(","):
6615                if profile not in profiles:
6616                    profiles.append(profile)
6617                    log.info(f"   {profile}")
6618
6619        # If profile "ALL" provided, all profiles in the config profiles
6620        if "ALL" in profiles:
6621            profiles = list(prioritizations_config.keys())
6622
6623        for profile in profiles:
6624            if prioritizations_config.get(profile, None):
6625                log.debug(f"Profile '{profile}' configured")
6626            else:
6627                msg_error = f"Profile '{profile}' NOT configured"
6628                log.error(msg_error)
6629                raise ValueError(msg_error)
6630
6631        if profiles:
6632            log.info(f"Prioritization... ")
6633        else:
6634            log.debug(f"No profile defined")
6635            return
6636
6637        if not default_profile and len(profiles):
6638            default_profile = profiles[0]
6639
6640        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6641        log.debug("Profiles to check: " + str(list(profiles)))
6642
6643        # Variables
6644        table_variants = self.get_table_variants(clause="update")
6645
6646        # Added columns
6647        added_columns = []
6648
6649        # Create list of PZfields
6650        # List of PZFields
6651        list_of_pzfields_original = pzfields + [
6652            pzfield + pzfields_sep + profile
6653            for pzfield in pzfields
6654            for profile in profiles
6655        ]
6656        list_of_pzfields = []
6657        log.debug(f"{list_of_pzfields_original}")
6658
6659        # Remove existing PZfields to use if exists
6660        for pzfield in list_of_pzfields_original:
6661            if self.get_header().infos.get(pzfield, None) is None:
6662                list_of_pzfields.append(pzfield)
6663                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6664            else:
6665                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6666
6667        if list_of_pzfields:
6668
6669            # Explode Infos fields
6670            explode_infos_prefix = self.get_explode_infos_prefix()
6671            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6672            extra_infos = self.get_extra_infos()
6673
6674            # PZfields tags description
6675            PZfields_INFOS = {
6676                "PZTags": {
6677                    "ID": "PZTags",
6678                    "Number": ".",
6679                    "Type": "String",
6680                    "Description": "Variant tags based on annotation criteria",
6681                },
6682                "PZScore": {
6683                    "ID": "PZScore",
6684                    "Number": 1,
6685                    "Type": "Integer",
6686                    "Description": "Variant score based on annotation criteria",
6687                },
6688                "PZFlag": {
6689                    "ID": "PZFlag",
6690                    "Number": 1,
6691                    "Type": "String",
6692                    "Description": "Variant flag based on annotation criteria",
6693                },
6694                "PZComment": {
6695                    "ID": "PZComment",
6696                    "Number": ".",
6697                    "Type": "String",
6698                    "Description": "Variant comment based on annotation criteria",
6699                },
6700                "PZInfos": {
6701                    "ID": "PZInfos",
6702                    "Number": ".",
6703                    "Type": "String",
6704                    "Description": "Variant infos based on annotation criteria",
6705                },
6706            }
6707
6708            # Create INFO fields if not exist
6709            for field in PZfields_INFOS:
6710                field_ID = PZfields_INFOS[field]["ID"]
6711                field_description = PZfields_INFOS[field]["Description"]
6712                if field_ID not in self.get_header().infos and field_ID in pzfields:
6713                    field_description = (
6714                        PZfields_INFOS[field]["Description"]
6715                        + f", profile {default_profile}"
6716                    )
6717                    self.get_header().infos[field_ID] = vcf.parser._Info(
6718                        field_ID,
6719                        PZfields_INFOS[field]["Number"],
6720                        PZfields_INFOS[field]["Type"],
6721                        field_description,
6722                        "unknown",
6723                        "unknown",
6724                        code_type_map[PZfields_INFOS[field]["Type"]],
6725                    )
6726
6727            # Create INFO fields if not exist for each profile
6728            for profile in prioritizations_config:
6729                if profile in profiles or profiles == []:
6730                    for field in PZfields_INFOS:
6731                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6732                        field_description = (
6733                            PZfields_INFOS[field]["Description"]
6734                            + f", profile {profile}"
6735                        )
6736                        if (
6737                            field_ID not in self.get_header().infos
6738                            and field in pzfields
6739                        ):
6740                            self.get_header().infos[field_ID] = vcf.parser._Info(
6741                                field_ID,
6742                                PZfields_INFOS[field]["Number"],
6743                                PZfields_INFOS[field]["Type"],
6744                                field_description,
6745                                "unknown",
6746                                "unknown",
6747                                code_type_map[PZfields_INFOS[field]["Type"]],
6748                            )
6749
6750            # Header
6751            for pzfield in list_of_pzfields:
6752                if re.match("PZScore.*", pzfield):
6753                    added_column = self.add_column(
6754                        table_name=table_variants,
6755                        column_name=pzfield,
6756                        column_type="INTEGER",
6757                        default_value="0",
6758                    )
6759                elif re.match("PZFlag.*", pzfield):
6760                    added_column = self.add_column(
6761                        table_name=table_variants,
6762                        column_name=pzfield,
6763                        column_type="BOOLEAN",
6764                        default_value="1",
6765                    )
6766                else:
6767                    added_column = self.add_column(
6768                        table_name=table_variants,
6769                        column_name=pzfield,
6770                        column_type="STRING",
6771                        default_value="''",
6772                    )
6773                added_columns.append(added_column)
6774
6775            # Profiles
6776            if profiles:
6777
6778                # foreach profile in configuration file
6779                for profile in prioritizations_config:
6780
6781                    # If profile is asked in param, or ALL are asked (empty profile [])
6782                    if profile in profiles or profiles == []:
6783                        log.info(f"Profile '{profile}'")
6784
6785                        sql_set_info_option = ""
6786
6787                        sql_set_info = []
6788
6789                        # PZ fields set
6790
6791                        # PZScore
6792                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6793                            sql_set_info.append(
6794                                f"""
6795                                    concat(
6796                                        'PZScore{pzfields_sep}{profile}=',
6797                                        PZScore{pzfields_sep}{profile}
6798                                    ) 
6799                                """
6800                            )
6801                            if (
6802                                profile == default_profile
6803                                and "PZScore" in list_of_pzfields
6804                            ):
6805                                sql_set_info.append(
6806                                    f"""
6807                                        concat(
6808                                            'PZScore=',
6809                                            PZScore{pzfields_sep}{profile}
6810                                        )
6811                                    """
6812                                )
6813
6814                        # PZFlag
6815                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6816                            sql_set_info.append(
6817                                f"""
6818                                    concat(
6819                                        'PZFlag{pzfields_sep}{profile}=',
6820                                        CASE 
6821                                            WHEN PZFlag{pzfields_sep}{profile}==1
6822                                            THEN 'PASS'
6823                                            WHEN PZFlag{pzfields_sep}{profile}==0
6824                                            THEN 'FILTERED'
6825                                        END
6826                                    ) 
6827                                """
6828                            )
6829                            if (
6830                                profile == default_profile
6831                                and "PZFlag" in list_of_pzfields
6832                            ):
6833                                sql_set_info.append(
6834                                    f"""
6835                                        concat(
6836                                            'PZFlag=',
6837                                            CASE 
6838                                                WHEN PZFlag{pzfields_sep}{profile}==1
6839                                                THEN 'PASS'
6840                                                WHEN PZFlag{pzfields_sep}{profile}==0
6841                                                THEN 'FILTERED'
6842                                            END
6843                                        )
6844                                    """
6845                                )
6846
6847                        # PZComment
6848                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6849                            sql_set_info.append(
6850                                f"""
6851                                    CASE
6852                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6853                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6854                                        ELSE ''
6855                                    END
6856                                """
6857                            )
6858                            if (
6859                                profile == default_profile
6860                                and "PZComment" in list_of_pzfields
6861                            ):
6862                                sql_set_info.append(
6863                                    f"""
6864                                        CASE
6865                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6866                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6867                                            ELSE ''
6868                                        END
6869                                    """
6870                                )
6871
6872                        # PZInfos
6873                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6874                            sql_set_info.append(
6875                                f"""
6876                                    CASE
6877                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6878                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6879                                        ELSE ''
6880                                    END
6881                                """
6882                            )
6883                            if (
6884                                profile == default_profile
6885                                and "PZInfos" in list_of_pzfields
6886                            ):
6887                                sql_set_info.append(
6888                                    f"""
6889                                        CASE
6890                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6891                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6892                                            ELSE ''
6893                                        END
6894                                    """
6895                                )
6896
6897                        # Merge PZfields
6898                        sql_set_info_option = ""
6899                        sql_set_sep = ""
6900                        for sql_set in sql_set_info:
6901                            if sql_set_sep:
6902                                sql_set_info_option += f"""
6903                                    , concat('{sql_set_sep}', {sql_set})
6904                                """
6905                            else:
6906                                sql_set_info_option += f"""
6907                                    , {sql_set}
6908                                """
6909                            sql_set_sep = ";"
6910
6911                        sql_queries = []
6912                        for annotation in prioritizations_config[profile]:
6913
6914                            # Check if annotation field is present
6915                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6916                                log.debug(f"Annotation '{annotation}' not in data")
6917                                continue
6918                            else:
6919                                log.debug(f"Annotation '{annotation}' in data")
6920
6921                            # For each criterions
6922                            for criterion in prioritizations_config[profile][
6923                                annotation
6924                            ]:
6925                                criterion_type = criterion["type"]
6926                                criterion_value = criterion["value"]
6927                                criterion_score = criterion.get("score", 0)
6928                                criterion_flag = criterion.get("flag", "PASS")
6929                                criterion_flag_bool = criterion_flag == "PASS"
6930                                criterion_comment = (
6931                                    ", ".join(criterion.get("comment", []))
6932                                    .replace("'", "''")
6933                                    .replace(";", ",")
6934                                    .replace("\t", " ")
6935                                )
6936                                criterion_infos = (
6937                                    str(criterion)
6938                                    .replace("'", "''")
6939                                    .replace(";", ",")
6940                                    .replace("\t", " ")
6941                                )
6942
6943                                sql_set = []
6944                                sql_set_info = []
6945
6946                                # PZ fields set
6947                                if (
6948                                    f"PZScore{pzfields_sep}{profile}"
6949                                    in list_of_pzfields
6950                                ):
6951                                    if prioritization_score_mode == "HOWARD":
6952                                        sql_set.append(
6953                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6954                                        )
6955                                    elif prioritization_score_mode == "VaRank":
6956                                        sql_set.append(
6957                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6958                                        )
6959                                    else:
6960                                        sql_set.append(
6961                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6962                                        )
6963                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6964                                    sql_set.append(
6965                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6966                                    )
6967                                if (
6968                                    f"PZComment{pzfields_sep}{profile}"
6969                                    in list_of_pzfields
6970                                ):
6971                                    sql_set.append(
6972                                        f"""
6973                                            PZComment{pzfields_sep}{profile} = 
6974                                                concat(
6975                                                    PZComment{pzfields_sep}{profile},
6976                                                    CASE 
6977                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6978                                                        THEN ', '
6979                                                        ELSE ''
6980                                                    END,
6981                                                    '{criterion_comment}'
6982                                                )
6983                                        """
6984                                    )
6985                                if (
6986                                    f"PZInfos{pzfields_sep}{profile}"
6987                                    in list_of_pzfields
6988                                ):
6989                                    sql_set.append(
6990                                        f"""
6991                                            PZInfos{pzfields_sep}{profile} = 
6992                                                concat(
6993                                                    PZInfos{pzfields_sep}{profile},
6994                                                    '{criterion_infos}'
6995                                                )
6996                                        """
6997                                    )
6998                                sql_set_option = ",".join(sql_set)
6999
7000                                # Criterion and comparison
7001                                try:
7002                                    float(criterion_value)
7003                                    sql_update = f"""
7004                                        UPDATE {table_variants}
7005                                        SET {sql_set_option}
7006                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7007                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7008                                        """
7009                                except:
7010                                    contains_option = ""
7011                                    if criterion_type == "contains":
7012                                        contains_option = ".*"
7013                                    sql_update = f"""
7014                                        UPDATE {table_variants}
7015                                        SET {sql_set_option}
7016                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7017                                        """
7018                                sql_queries.append(sql_update)
7019
7020                        # PZTags
7021                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7022
7023                            # Create PZFalgs value
7024                            pztags_value = ""
7025                            pztags_sep_default = "|"
7026                            pztags_sep = ""
7027                            for pzfield in pzfields:
7028                                if pzfield not in ["PZTags"]:
7029                                    if (
7030                                        f"{pzfield}{pzfields_sep}{profile}"
7031                                        in list_of_pzfields
7032                                    ):
7033                                        if pzfield in ["PZFlag"]:
7034                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7035                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7036                                                    THEN 'PASS'
7037                                                    ELSE 'FILTERED'
7038                                                END, '"""
7039                                        else:
7040                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7041                                        pztags_sep = pztags_sep_default
7042
7043                            # Add Query update for PZFlags
7044                            sql_update_pztags = f"""
7045                                UPDATE {table_variants}
7046                                SET INFO = concat(
7047                                        INFO,
7048                                        CASE WHEN INFO NOT in ('','.')
7049                                                THEN ';'
7050                                                ELSE ''
7051                                        END,
7052                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7053                                    )
7054                                """
7055                            sql_queries.append(sql_update_pztags)
7056
7057                            # Add Query update for PZFlags for default
7058                            if profile == default_profile:
7059                                sql_update_pztags_default = f"""
7060                                UPDATE {table_variants}
7061                                SET INFO = concat(
7062                                        INFO,
7063                                        ';',
7064                                        'PZTags={pztags_value}'
7065                                    )
7066                                """
7067                                sql_queries.append(sql_update_pztags_default)
7068
7069                        log.info(f"""Profile '{profile}' - Prioritization... """)
7070
7071                        if sql_queries:
7072
7073                            for sql_query in sql_queries:
7074                                log.debug(
7075                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7076                                )
7077                                self.conn.execute(sql_query)
7078
7079                        log.info(f"""Profile '{profile}' - Update... """)
7080                        sql_query_update = f"""
7081                            UPDATE {table_variants}
7082                            SET INFO =  
7083                                concat(
7084                                    CASE
7085                                        WHEN INFO NOT IN ('','.')
7086                                        THEN concat(INFO, ';')
7087                                        ELSE ''
7088                                    END
7089                                    {sql_set_info_option}
7090                                )
7091                        """
7092                        self.conn.execute(sql_query_update)
7093
7094        else:
7095
7096            log.warning(f"No profiles in parameters")
7097
7098        # Remove added columns
7099        for added_column in added_columns:
7100            self.drop_column(column=added_column)
7101
7102        # Explode INFOS fields into table fields
7103        if self.get_explode_infos():
7104            self.explode_infos(
7105                prefix=self.get_explode_infos_prefix(),
7106                fields=self.get_explode_infos_fields(),
7107                force=True,
7108            )
7109
7110        return
7111
7112    ###
7113    # HGVS
7114    ###
7115
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Workflow: load refSeq (and optionally refSeqLink) tables into Polars dataframes through
        duckDB, load the transcripts model and the genome FASTA, then annotate each SNV/InDel
        variant in parallel with Dask, and finally write the result back into the INFO column
        as an 'hgvs=' field (with a matching VCF header entry).

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closure: reads `polars_conn`, `refseq_df`, `refseqlink_df`, `transcripts`, `genome`
            and the HGVS option flags from the enclosing scope at call time.

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): chr/pos are interpolated directly into the SQL text; values come
            # from the variants table, not user input, but confirm they cannot contain quotes
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): this queries `refseqlink_df`, which is only assigned when a
                # refSeqLink file was found — if use_protein/add_protein/full_format is set
                # without a refSeqLink database this raises NameError; confirm intended
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally emit a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): this context is re-created after the dataframes are loaded (below);
        # this first creation appears redundant since the closures resolve `polars_conn`
        # at call time — confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse comma-separated "hgvs_options" shortcut (e.g. "use_gene,codon_type=1")
        # into the param["hgvs"] dict; bare options default to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            # No "hgvs" section in param: nothing to do
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # NOTE(review): find_genome(databases_genome) is evaluated twice; could be
        # hoisted into a single call — confirm find_genome has no side effects
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (letters-only REF/ALT filters out symbolic alleles such as <DEL> or breakends)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix lowers but does not eliminate the chance of a
        # collision with an existing column — confirm add_column handles duplicates
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (transcripts whose [txStart, txEnd] interval covers a variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            # (maps transcript accessions to protein accessions, versions included)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # The relevant refSeq rows are exported to a TSV file first, because
        # read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # Re-created here so that refseq_df (and refseqlink_df, if loaded) are
        # registered as globals for the per-row SQL lookups in the closures above
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # Join back on the full variant key (CHROM/POS/REF/ALT), skipping
            # empty or NULL HGVS results
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # Append 'hgvs=<value>' to INFO, prefixing with ';' only when INFO is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # Register the new 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7498
7499    ###
7500    # Calculation
7501    ###
7502
7503    def get_operations_help(
7504        self, operations_config_dict: dict = {}, operations_config_file: str = None
7505    ) -> list:
7506
7507        # Init
7508        operations_help = []
7509
7510        # operations
7511        operations = self.get_config_json(
7512            name="calculations",
7513            config_dict=operations_config_dict,
7514            config_file=operations_config_file,
7515        )
7516        for op in operations:
7517            op_name = operations[op].get("name", op).upper()
7518            op_description = operations[op].get("description", op_name)
7519            op_available = operations[op].get("available", False)
7520            if op_available:
7521                operations_help.append(f"   {op_name}: {op_description}")
7522
7523        # Sort operations
7524        operations_help.sort()
7525
7526        # insert header
7527        operations_help.insert(0, "Available calculation operations:")
7528
7529        # Return
7530        return operations_help
7531
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        Operation resolution order: param["calculation"]["calculations"] overrides the
        `operations` argument; the quick "calculations" param (comma-separated string)
        adds further operations and mutates `param` in place via add_value_into_dict.

        :param operations: dictionary of operations to perform (keys are operation names)
        :param operations_config_dict: optional configuration dictionary of operations
        :param operations_config_file: optional path to a configuration file of operations
        :raises ValueError: if an operation name or operation type is not available in the
        operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                "middle" : null
            }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        # Operation names are matched case-insensitively by uppercasing both sides
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        # "calculations" param is a comma-separated shortcut (e.g. "NOMEN,VARTYPE");
        # each entry is added to `operations` and recorded back into `param`
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        # NOTE(review): this re-read looks redundant with the assignment above unless
        # `param` was mutated in between — confirm before simplifying
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations
        # Dispatch each operation to the python or sql processor depending on its
        # configured "type" (default "sql"); unknown names/types are fatal
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
7636
7637    def calculation_process_sql(
7638        self, operation: dict, operation_name: str = "unknown"
7639    ) -> None:
7640        """
7641        The `calculation_process_sql` function takes in a mathematical operation as a string and
7642        performs the operation, updating the specified table with the result.
7643
7644        :param operation: The `operation` parameter is a dictionary that contains information about the
7645        mathematical operation to be performed. It includes the following keys:
7646        :type operation: dict
7647        :param operation_name: The `operation_name` parameter is a string that represents the name of
7648        the mathematical operation being performed. It is used for logging and error handling purposes,
7649        defaults to unknown
7650        :type operation_name: str (optional)
7651        """
7652
7653        # table variants
7654        table_variants = self.get_table_variants(clause="alter")
7655
7656        # Operation infos
7657        operation_name = operation.get("name", "unknown")
7658        log.debug(f"process sql {operation_name}")
7659        output_column_name = operation.get("output_column_name", operation_name)
7660        output_column_type = operation.get("output_column_type", "String")
7661        prefix = operation.get("explode_infos_prefix", "")
7662        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7663        output_column_description = operation.get(
7664            "output_column_description", f"{operation_name} operation"
7665        )
7666        operation_query = operation.get("operation_query", None)
7667        if isinstance(operation_query, list):
7668            operation_query = " ".join(operation_query)
7669        operation_info_fields = operation.get("info_fields", [])
7670        operation_info_fields_check = operation.get("info_fields_check", False)
7671        operation_info = operation.get("operation_info", True)
7672
7673        if operation_query:
7674
7675            # Info fields check
7676            operation_info_fields_check_result = True
7677            if operation_info_fields_check:
7678                header_infos = self.get_header().infos
7679                for info_field in operation_info_fields:
7680                    operation_info_fields_check_result = (
7681                        operation_info_fields_check_result
7682                        and info_field in header_infos
7683                    )
7684
7685            # If info fields available
7686            if operation_info_fields_check_result:
7687
7688                # Added_columns
7689                added_columns = []
7690
7691                # Create VCF header field
7692                vcf_reader = self.get_header()
7693                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7694                    output_column_name,
7695                    ".",
7696                    output_column_type,
7697                    output_column_description,
7698                    "howard calculation",
7699                    "0",
7700                    self.code_type_map.get(output_column_type),
7701                )
7702
7703                # Explode infos if needed
7704                log.debug(f"calculation_process_sql prefix {prefix}")
7705                added_columns += self.explode_infos(
7706                    prefix=prefix,
7707                    fields=[output_column_name] + operation_info_fields,
7708                    force=True,
7709                )
7710
7711                # Create column
7712                added_column = self.add_column(
7713                    table_name=table_variants,
7714                    column_name=prefix + output_column_name,
7715                    column_type=output_column_type_sql,
7716                    default_value="null",
7717                )
7718                added_columns.append(added_column)
7719
7720                # Operation calculation
7721                try:
7722
7723                    # Query to update calculation column
7724                    sql_update = f"""
7725                        UPDATE {table_variants}
7726                        SET "{prefix}{output_column_name}" = ({operation_query})
7727                    """
7728                    self.conn.execute(sql_update)
7729
7730                    # Add to INFO
7731                    if operation_info:
7732                        sql_update_info = f"""
7733                            UPDATE {table_variants}
7734                            SET "INFO" =
7735                                concat(
7736                                    CASE
7737                                        WHEN "INFO" IS NOT NULL
7738                                        THEN concat("INFO", ';')
7739                                        ELSE ''
7740                                    END,
7741                                    '{output_column_name}=',
7742                                    "{prefix}{output_column_name}"
7743                                )
7744                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7745                        """
7746                        self.conn.execute(sql_update_info)
7747
7748                except:
7749                    log.error(
7750                        f"Operations config: Calculation '{operation_name}' query failed"
7751                    )
7752                    raise ValueError(
7753                        f"Operations config: Calculation '{operation_name}' query failed"
7754                    )
7755
7756                # Remove added columns
7757                for added_column in added_columns:
7758                    log.debug(f"added_column: {added_column}")
7759                    self.drop_column(column=added_column)
7760
7761            else:
7762                log.error(
7763                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7764                )
7765                raise ValueError(
7766                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7767                )
7768
7769        else:
7770            log.error(
7771                f"Operations config: Calculation '{operation_name}' query NOT defined"
7772            )
7773            raise ValueError(
7774                f"Operations config: Calculation '{operation_name}' query NOT defined"
7775            )
7776
7777    def calculation_process_function(
7778        self, operation: dict, operation_name: str = "unknown"
7779    ) -> None:
7780        """
7781        The `calculation_process_function` takes in an operation dictionary and performs the specified
7782        function with the given parameters.
7783
7784        :param operation: The `operation` parameter is a dictionary that contains information about the
7785        operation to be performed. It has the following keys:
7786        :type operation: dict
7787        :param operation_name: The `operation_name` parameter is a string that represents the name of
7788        the operation being performed. It is used for logging purposes, defaults to unknown
7789        :type operation_name: str (optional)
7790        """
7791
7792        operation_name = operation["name"]
7793        log.debug(f"process sql {operation_name}")
7794        function_name = operation["function_name"]
7795        function_params = operation["function_params"]
7796        getattr(self, function_name)(*function_params)
7797
7798    def calculation_variant_id(self) -> None:
7799        """
7800        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7801        updates the INFO field of a variants table with the variant ID.
7802        """
7803
7804        # variant_id annotation field
7805        variant_id_tag = self.get_variant_id_column()
7806        added_columns = [variant_id_tag]
7807
7808        # variant_id hgvs tags"
7809        vcf_infos_tags = {
7810            variant_id_tag: "howard variant ID annotation",
7811        }
7812
7813        # Variants table
7814        table_variants = self.get_table_variants()
7815
7816        # Header
7817        vcf_reader = self.get_header()
7818
7819        # Add variant_id to header
7820        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7821            variant_id_tag,
7822            ".",
7823            "String",
7824            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7825            "howard calculation",
7826            "0",
7827            self.code_type_map.get("String"),
7828        )
7829
7830        # Update
7831        sql_update = f"""
7832            UPDATE {table_variants}
7833            SET "INFO" = 
7834                concat(
7835                    CASE
7836                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7837                        THEN ''
7838                        ELSE concat("INFO", ';')
7839                    END,
7840                    '{variant_id_tag}=',
7841                    "{variant_id_tag}"
7842                )
7843        """
7844        self.conn.execute(sql_update)
7845
7846        # Remove added columns
7847        for added_column in added_columns:
7848            self.drop_column(column=added_column)
7849
7850    def calculation_extract_snpeff_hgvs(
7851        self,
7852        snpeff_hgvs: str = "snpeff_hgvs",
7853        snpeff_field: str = "ANN",
7854    ) -> None:
7855        """
7856        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7857        annotation field in a VCF file and adds them as a new column in the variants table.
7858
7859        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
7860        function is used to specify the name of the column that will store the HGVS nomenclatures
7861        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
7862        snpeff_hgvs
7863        :type snpeff_hgvs: str (optional)
7864        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
7865        function represents the field in the VCF file that contains SnpEff annotations. This field is
7866        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
7867        to ANN
7868        :type snpeff_field: str (optional)
7869        """
7870
7871        # Snpeff hgvs tags
7872        vcf_infos_tags = {
7873            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7874        }
7875
7876        # Prefix
7877        prefix = self.get_explode_infos_prefix()
7878        if prefix:
7879            prefix = "INFO/"
7880
7881        # snpEff fields
7882        speff_ann_infos = prefix + snpeff_field
7883        speff_hgvs_infos = prefix + snpeff_hgvs
7884
7885        # Variants table
7886        table_variants = self.get_table_variants()
7887
7888        # Header
7889        vcf_reader = self.get_header()
7890
7891        # Add columns
7892        added_columns = []
7893
7894        # Explode HGVS field in column
7895        added_columns += self.explode_infos(fields=[snpeff_field])
7896
7897        if snpeff_field in vcf_reader.infos:
7898
7899            log.debug(vcf_reader.infos[snpeff_field])
7900
7901            # Extract ANN header
7902            ann_description = vcf_reader.infos[snpeff_field].desc
7903            pattern = r"'(.+?)'"
7904            match = re.search(pattern, ann_description)
7905            if match:
7906                ann_header_match = match.group(1).split(" | ")
7907                ann_header_desc = {}
7908                for i in range(len(ann_header_match)):
7909                    ann_header_info = "".join(
7910                        char for char in ann_header_match[i] if char.isalnum()
7911                    )
7912                    ann_header_desc[ann_header_info] = ann_header_match[i]
7913                if not ann_header_desc:
7914                    raise ValueError("Invalid header description format")
7915            else:
7916                raise ValueError("Invalid header description format")
7917
7918            # Create variant id
7919            variant_id_column = self.get_variant_id_column()
7920            added_columns += [variant_id_column]
7921
7922            # Create dataframe
7923            dataframe_snpeff_hgvs = self.get_query_to_df(
7924                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7925            )
7926
7927            # Create main NOMEN column
7928            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7929                speff_ann_infos
7930            ].apply(
7931                lambda x: extract_snpeff_hgvs(
7932                    str(x), header=list(ann_header_desc.values())
7933                )
7934            )
7935
7936            # Add snpeff_hgvs to header
7937            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7938                snpeff_hgvs,
7939                ".",
7940                "String",
7941                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7942                "howard calculation",
7943                "0",
7944                self.code_type_map.get("String"),
7945            )
7946
7947            # Update
7948            sql_update = f"""
7949                UPDATE variants
7950                SET "INFO" = 
7951                    concat(
7952                        CASE
7953                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7954                            THEN ''
7955                            ELSE concat("INFO", ';')
7956                        END,
7957                        CASE 
7958                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7959                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7960                            THEN concat(
7961                                    '{snpeff_hgvs}=',
7962                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7963                                )
7964                            ELSE ''
7965                        END
7966                    )
7967                FROM dataframe_snpeff_hgvs
7968                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7969
7970            """
7971            self.conn.execute(sql_update)
7972
7973            # Delete dataframe
7974            del dataframe_snpeff_hgvs
7975            gc.collect()
7976
7977        else:
7978
7979            log.warning(
7980                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7981            )
7982
7983        # Remove added columns
7984        for added_column in added_columns:
7985            self.drop_column(column=added_column)
7986
7987    def calculation_snpeff_ann_explode(
7988        self,
7989        uniquify: bool = True,
7990        output_format: str = "fields",
7991        output_prefix: str = "snpeff_",
7992        snpeff_field: str = "ANN",
7993    ) -> None:
7994        """
7995        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
7996        exploding the HGVS field and updating variant information accordingly.
7997
7998        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
7999        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8000        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8001        defaults to True
8002        :type uniquify: bool (optional)
8003        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8004        function specifies the format in which the output annotations will be generated. It has a
8005        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8006        format, defaults to fields
8007        :type output_format: str (optional)
8008        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8009        method is used to specify the prefix that will be added to the output annotations generated
8010        during the calculation process. This prefix helps to differentiate the newly added annotations
8011        from existing ones in the output data. By default, the, defaults to ANN_
8012        :type output_prefix: str (optional)
8013        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8014        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8015        field will be processed to explode the HGVS annotations and update the variant information
8016        accordingly, defaults to ANN
8017        :type snpeff_field: str (optional)
8018        """
8019
8020        # SnpEff annotation field
8021        snpeff_hgvs = "snpeff_ann_explode"
8022
8023        # Snpeff hgvs tags
8024        vcf_infos_tags = {
8025            snpeff_hgvs: "Explode snpEff annotations",
8026        }
8027
8028        # Prefix
8029        prefix = self.get_explode_infos_prefix()
8030        if prefix:
8031            prefix = "INFO/"
8032
8033        # snpEff fields
8034        speff_ann_infos = prefix + snpeff_field
8035        speff_hgvs_infos = prefix + snpeff_hgvs
8036
8037        # Variants table
8038        table_variants = self.get_table_variants()
8039
8040        # Header
8041        vcf_reader = self.get_header()
8042
8043        # Add columns
8044        added_columns = []
8045
8046        # Explode HGVS field in column
8047        added_columns += self.explode_infos(fields=[snpeff_field])
8048        log.debug(f"snpeff_field={snpeff_field}")
8049        log.debug(f"added_columns={added_columns}")
8050
8051        if snpeff_field in vcf_reader.infos:
8052
8053            # Extract ANN header
8054            ann_description = vcf_reader.infos[snpeff_field].desc
8055            pattern = r"'(.+?)'"
8056            match = re.search(pattern, ann_description)
8057            if match:
8058                ann_header_match = match.group(1).split(" | ")
8059                ann_header = []
8060                ann_header_desc = {}
8061                for i in range(len(ann_header_match)):
8062                    ann_header_info = "".join(
8063                        char for char in ann_header_match[i] if char.isalnum()
8064                    )
8065                    ann_header.append(ann_header_info)
8066                    ann_header_desc[ann_header_info] = ann_header_match[i]
8067                if not ann_header_desc:
8068                    raise ValueError("Invalid header description format")
8069            else:
8070                raise ValueError("Invalid header description format")
8071
8072            # Create variant id
8073            variant_id_column = self.get_variant_id_column()
8074            added_columns += [variant_id_column]
8075
8076            # Create dataframe
8077            dataframe_snpeff_hgvs = self.get_query_to_df(
8078                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8079            )
8080
8081            # Create snpEff columns
8082            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8083                speff_ann_infos
8084            ].apply(
8085                lambda x: explode_snpeff_ann(
8086                    str(x),
8087                    uniquify=uniquify,
8088                    output_format=output_format,
8089                    prefix=output_prefix,
8090                    header=list(ann_header_desc.values()),
8091                )
8092            )
8093
8094            # Header
8095            ann_annotations_prefix = ""
8096            if output_format.upper() in ["JSON"]:
8097                ann_annotations_prefix = f"{output_prefix}="
8098                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8099                    output_prefix,
8100                    ".",
8101                    "String",
8102                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8103                    + " - JSON format",
8104                    "howard calculation",
8105                    "0",
8106                    self.code_type_map.get("String"),
8107                )
8108            else:
8109                for ann_annotation in ann_header:
8110                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8111                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8112                        ann_annotation_id,
8113                        ".",
8114                        "String",
8115                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8116                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8117                        "howard calculation",
8118                        "0",
8119                        self.code_type_map.get("String"),
8120                    )
8121
8122            # Update
8123            sql_update = f"""
8124                UPDATE variants
8125                SET "INFO" = 
8126                    concat(
8127                        CASE
8128                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8129                            THEN ''
8130                            ELSE concat("INFO", ';')
8131                        END,
8132                        CASE 
8133                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8134                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8135                            THEN concat(
8136                                '{ann_annotations_prefix}',
8137                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8138                                )
8139                            ELSE ''
8140                        END
8141                    )
8142                FROM dataframe_snpeff_hgvs
8143                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8144
8145            """
8146            self.conn.execute(sql_update)
8147
8148            # Delete dataframe
8149            del dataframe_snpeff_hgvs
8150            gc.collect()
8151
8152        else:
8153
8154            log.warning(
8155                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8156            )
8157
8158        # Remove added columns
8159        for added_column in added_columns:
8160            self.drop_column(column=added_column)
8161
    def calculation_extract_nomen(self) -> None:
        """
        Compute NOMEN hgvs nomenclatures for each variant and append them to INFO.

        The HGVS field (configured in
        param["calculation"]["calculations"]["NOMEN"]["options"]["hgvs_field"],
        default "hgvs") is exploded into a table column, each value is parsed
        with `find_nomen` (optionally constrained by a transcripts file), and
        every non-empty NOMEN sub-field (NOMEN, CNOMEN, ...) is appended to the
        INFO column as ';<field>=<value>'. Does nothing if the exploded HGVS
        column is not available.

        :raises ValueError: If the configured transcripts file does not exist.
        """

        # Name of the temporary dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> description used for the VCF header
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field name from the NOMEN calculation options
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts file (preferred transcripts used to rank nomenclatures)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                # First column of the file holds the transcript IDs
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added for this calculation, dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: exploded columns currently available on the variants table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant keys and the exploded HGVS column
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column: a dict of NOMEN sub-fields per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (apply runs eagerly, so capturing the loop variable is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field for this sub-field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';<field>=<value>' when the value is non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update: one CASE fragment per NOMEN sub-field
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joining the dataframe on the variant key columns
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8304
8305    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8306        """
8307        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8308        pipeline/sample for a variant and updates the variant information in a VCF file.
8309
8310        :param tag: The `tag` parameter is a string that represents the annotation field for the
8311        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8312        VCF header and to update the corresponding field in the variants table, defaults to
8313        findbypipeline
8314        :type tag: str (optional)
8315        """
8316
8317        # if FORMAT and samples
8318        if (
8319            "FORMAT" in self.get_header_columns_as_list()
8320            and self.get_header_sample_list()
8321        ):
8322
8323            # findbypipeline annotation field
8324            findbypipeline_tag = tag
8325
8326            # VCF infos tags
8327            vcf_infos_tags = {
8328                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8329            }
8330
8331            # Prefix
8332            prefix = self.get_explode_infos_prefix()
8333
8334            # Field
8335            findbypipeline_infos = prefix + findbypipeline_tag
8336
8337            # Variants table
8338            table_variants = self.get_table_variants()
8339
8340            # Header
8341            vcf_reader = self.get_header()
8342
8343            # Create variant id
8344            variant_id_column = self.get_variant_id_column()
8345            added_columns = [variant_id_column]
8346
8347            # variant_id, FORMAT and samples
8348            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8349                self.get_header_sample_list()
8350            )
8351
8352            # Create dataframe
8353            dataframe_findbypipeline = self.get_query_to_df(
8354                f""" SELECT {samples_fields} FROM {table_variants} """
8355            )
8356
8357            # Create findbypipeline column
8358            dataframe_findbypipeline[findbypipeline_infos] = (
8359                dataframe_findbypipeline.apply(
8360                    lambda row: findbypipeline(
8361                        row, samples=self.get_header_sample_list()
8362                    ),
8363                    axis=1,
8364                )
8365            )
8366
8367            # Add snpeff_hgvs to header
8368            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8369                findbypipeline_tag,
8370                ".",
8371                "String",
8372                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8373                "howard calculation",
8374                "0",
8375                self.code_type_map.get("String"),
8376            )
8377
8378            # Update
8379            sql_update = f"""
8380                UPDATE variants
8381                SET "INFO" = 
8382                    concat(
8383                        CASE
8384                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8385                            THEN ''
8386                            ELSE concat("INFO", ';')
8387                        END,
8388                        CASE 
8389                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8390                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8391                            THEN concat(
8392                                    '{findbypipeline_tag}=',
8393                                    dataframe_findbypipeline."{findbypipeline_infos}"
8394                                )
8395                            ELSE ''
8396                        END
8397                    )
8398                FROM dataframe_findbypipeline
8399                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8400            """
8401            self.conn.execute(sql_update)
8402
8403            # Remove added columns
8404            for added_column in added_columns:
8405                self.drop_column(column=added_column)
8406
8407            # Delete dataframe
8408            del dataframe_findbypipeline
8409            gc.collect()
8410
8411    def calculation_genotype_concordance(self) -> None:
8412        """
8413        The function `calculation_genotype_concordance` calculates the genotype concordance for
8414        multi-caller VCF files and updates the variant information in the database.
8415        """
8416
8417        # if FORMAT and samples
8418        if (
8419            "FORMAT" in self.get_header_columns_as_list()
8420            and self.get_header_sample_list()
8421        ):
8422
8423            # genotypeconcordance annotation field
8424            genotypeconcordance_tag = "genotypeconcordance"
8425
8426            # VCF infos tags
8427            vcf_infos_tags = {
8428                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8429            }
8430
8431            # Prefix
8432            prefix = self.get_explode_infos_prefix()
8433
8434            # Field
8435            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8436
8437            # Variants table
8438            table_variants = self.get_table_variants()
8439
8440            # Header
8441            vcf_reader = self.get_header()
8442
8443            # Create variant id
8444            variant_id_column = self.get_variant_id_column()
8445            added_columns = [variant_id_column]
8446
8447            # variant_id, FORMAT and samples
8448            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8449                self.get_header_sample_list()
8450            )
8451
8452            # Create dataframe
8453            dataframe_genotypeconcordance = self.get_query_to_df(
8454                f""" SELECT {samples_fields} FROM {table_variants} """
8455            )
8456
8457            # Create genotypeconcordance column
8458            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8459                dataframe_genotypeconcordance.apply(
8460                    lambda row: genotypeconcordance(
8461                        row, samples=self.get_header_sample_list()
8462                    ),
8463                    axis=1,
8464                )
8465            )
8466
8467            # Add genotypeconcordance to header
8468            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8469                genotypeconcordance_tag,
8470                ".",
8471                "String",
8472                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8473                "howard calculation",
8474                "0",
8475                self.code_type_map.get("String"),
8476            )
8477
8478            # Update
8479            sql_update = f"""
8480                UPDATE variants
8481                SET "INFO" = 
8482                    concat(
8483                        CASE
8484                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8485                            THEN ''
8486                            ELSE concat("INFO", ';')
8487                        END,
8488                        CASE
8489                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8490                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8491                            THEN concat(
8492                                    '{genotypeconcordance_tag}=',
8493                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8494                                )
8495                            ELSE ''
8496                        END
8497                    )
8498                FROM dataframe_genotypeconcordance
8499                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8500            """
8501            self.conn.execute(sql_update)
8502
8503            # Remove added columns
8504            for added_column in added_columns:
8505                self.drop_column(column=added_column)
8506
8507            # Delete dataframe
8508            del dataframe_genotypeconcordance
8509            gc.collect()
8510
8511    def calculation_barcode(self, tag: str = "barcode") -> None:
8512        """
8513        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8514        updates the INFO field in the file with the calculated barcode values.
8515
8516        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8517        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8518        the default tag name is set to "barcode", defaults to barcode
8519        :type tag: str (optional)
8520        """
8521
8522        # if FORMAT and samples
8523        if (
8524            "FORMAT" in self.get_header_columns_as_list()
8525            and self.get_header_sample_list()
8526        ):
8527
8528            # barcode annotation field
8529            if not tag:
8530                tag = "barcode"
8531
8532            # VCF infos tags
8533            vcf_infos_tags = {
8534                tag: "barcode calculation (VaRank)",
8535            }
8536
8537            # Prefix
8538            prefix = self.get_explode_infos_prefix()
8539
8540            # Field
8541            barcode_infos = prefix + tag
8542
8543            # Variants table
8544            table_variants = self.get_table_variants()
8545
8546            # Header
8547            vcf_reader = self.get_header()
8548
8549            # Create variant id
8550            variant_id_column = self.get_variant_id_column()
8551            added_columns = [variant_id_column]
8552
8553            # variant_id, FORMAT and samples
8554            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8555                self.get_header_sample_list()
8556            )
8557
8558            # Create dataframe
8559            dataframe_barcode = self.get_query_to_df(
8560                f""" SELECT {samples_fields} FROM {table_variants} """
8561            )
8562
8563            # Create barcode column
8564            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8565                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8566            )
8567
8568            # Add barcode to header
8569            vcf_reader.infos[tag] = vcf.parser._Info(
8570                tag,
8571                ".",
8572                "String",
8573                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8574                "howard calculation",
8575                "0",
8576                self.code_type_map.get("String"),
8577            )
8578
8579            # Update
8580            sql_update = f"""
8581                UPDATE {table_variants}
8582                SET "INFO" = 
8583                    concat(
8584                        CASE
8585                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8586                            THEN ''
8587                            ELSE concat("INFO", ';')
8588                        END,
8589                        CASE
8590                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8591                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8592                            THEN concat(
8593                                    '{tag}=',
8594                                    dataframe_barcode."{barcode_infos}"
8595                                )
8596                            ELSE ''
8597                        END
8598                    )
8599                FROM dataframe_barcode
8600                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8601            """
8602            self.conn.execute(sql_update)
8603
8604            # Remove added columns
8605            for added_column in added_columns:
8606                self.drop_column(column=added_column)
8607
8608            # Delete dataframe
8609            del dataframe_barcode
8610            gc.collect()
8611
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates family barcode values for variants in a
        VCF file and appends them to the FORMAT and sample genotype columns (not INFO).

        The family members are read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"],
        which may be a JSON file path, a JSON string, a comma-separated list of
        sample names, or a dict. Without a pedigree, all samples are used.

        Two genotype fields are appended to every sample: '<tag>' (the barcode,
        or '.' for samples outside the family) and '<tag>S' (the comma-separated
        family sample names, or '.').

        :param tag: The `tag` parameter is the genotype tag used for the family
        barcode. Falsy values fall back to the default "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or empty
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag=None or "")
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file: load it as JSON
                # (NOTE: the 'ped' name is reused for the file handle, then the dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, else comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: each listed sample maps to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                # (assumes JSON pedigrees are flat member -> sample mappings — TODO confirm)
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample of the VCF belongs to the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column from the family samples' genotypes
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family ('<tag>') and family samples ('<tag>S') as FORMAT
            # fields in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT), appending the
            # two new genotype fields to each
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    # Family member: gets the computed barcode and the family list
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    # FORMAT column: gets the two new field names
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the family: padded with missing values
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, pad to the FORMAT width first: strip the
                # alphanumeric field names from FORMAT (keeping the ':' separators)
                # and turn each ':' into ':.' so './.' becomes './.:.:...'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
8801
8802    def calculation_trio(self) -> None:
8803        """
8804        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8805        information to the INFO field of each variant.
8806        """
8807
8808        # if FORMAT and samples
8809        if (
8810            "FORMAT" in self.get_header_columns_as_list()
8811            and self.get_header_sample_list()
8812        ):
8813
8814            # trio annotation field
8815            trio_tag = "trio"
8816
8817            # VCF infos tags
8818            vcf_infos_tags = {
8819                "trio": "trio calculation",
8820            }
8821
8822            # Param
8823            param = self.get_param()
8824
8825            # Prefix
8826            prefix = self.get_explode_infos_prefix()
8827
8828            # Trio param
8829            trio_ped = (
8830                param.get("calculation", {})
8831                .get("calculations", {})
8832                .get("TRIO", {})
8833                .get("trio_pedigree", None)
8834            )
8835
8836            # Load trio
8837            if trio_ped:
8838
8839                # Trio pedigree is a file
8840                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8841                    log.debug("TRIO pedigree is file")
8842                    with open(full_path(trio_ped)) as trio_ped:
8843                        trio_ped = json.load(trio_ped)
8844
8845                # Trio pedigree is a string
8846                elif isinstance(trio_ped, str):
8847                    log.debug("TRIO pedigree is str")
8848                    try:
8849                        trio_ped = json.loads(trio_ped)
8850                        log.debug("TRIO pedigree is json str")
8851                    except ValueError as e:
8852                        trio_samples = trio_ped.split(",")
8853                        if len(trio_samples) == 3:
8854                            trio_ped = {
8855                                "father": trio_samples[0],
8856                                "mother": trio_samples[1],
8857                                "child": trio_samples[2],
8858                            }
8859                            log.debug("TRIO pedigree is list str")
8860                        else:
8861                            msg_error = "TRIO pedigree not well formatted"
8862                            log.error(msg_error)
8863                            raise ValueError(msg_error)
8864
8865                # Trio pedigree is a dict
8866                elif isinstance(trio_ped, dict):
8867                    log.debug("TRIO pedigree is dict")
8868
8869                # Trio pedigree is not well formatted
8870                else:
8871                    msg_error = "TRIO pedigree not well formatted"
8872                    log.error(msg_error)
8873                    raise ValueError(msg_error)
8874
8875                # Construct trio list
8876                trio_samples = [
8877                    trio_ped.get("father", ""),
8878                    trio_ped.get("mother", ""),
8879                    trio_ped.get("child", ""),
8880                ]
8881
8882            else:
8883                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8884                samples_list = self.get_header_sample_list()
8885                if len(samples_list) >= 3:
8886                    trio_samples = self.get_header_sample_list()[0:3]
8887                    trio_ped = {
8888                        "father": trio_samples[0],
8889                        "mother": trio_samples[1],
8890                        "child": trio_samples[2],
8891                    }
8892                else:
8893                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8894                    log.error(msg_error)
8895                    raise ValueError(msg_error)
8896
8897            # Check trio pedigree
8898            if not trio_ped or len(trio_ped) != 3:
8899                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8900                log.error(msg_error)
8901                raise ValueError(msg_error)
8902
8903            # Log
8904            log.info(
8905                f"Calculation 'TRIO' - Samples: "
8906                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8907            )
8908
8909            # Field
8910            trio_infos = prefix + trio_tag
8911
8912            # Variants table
8913            table_variants = self.get_table_variants()
8914
8915            # Header
8916            vcf_reader = self.get_header()
8917
8918            # Create variant id
8919            variant_id_column = self.get_variant_id_column()
8920            added_columns = [variant_id_column]
8921
8922            # variant_id, FORMAT and samples
8923            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8924                self.get_header_sample_list()
8925            )
8926
8927            # Create dataframe
8928            dataframe_trio = self.get_query_to_df(
8929                f""" SELECT {samples_fields} FROM {table_variants} """
8930            )
8931
8932            # Create trio column
8933            dataframe_trio[trio_infos] = dataframe_trio.apply(
8934                lambda row: trio(row, samples=trio_samples), axis=1
8935            )
8936
8937            # Add trio to header
8938            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8939                trio_tag,
8940                ".",
8941                "String",
8942                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8943                "howard calculation",
8944                "0",
8945                self.code_type_map.get("String"),
8946            )
8947
8948            # Update
8949            sql_update = f"""
8950                UPDATE {table_variants}
8951                SET "INFO" = 
8952                    concat(
8953                        CASE
8954                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8955                            THEN ''
8956                            ELSE concat("INFO", ';')
8957                        END,
8958                        CASE
8959                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8960                             AND dataframe_trio."{trio_infos}" NOT NULL
8961                            THEN concat(
8962                                    '{trio_tag}=',
8963                                    dataframe_trio."{trio_infos}"
8964                                )
8965                            ELSE ''
8966                        END
8967                    )
8968                FROM dataframe_trio
8969                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8970            """
8971            self.conn.execute(sql_update)
8972
8973            # Remove added columns
8974            for added_column in added_columns:
8975                self.drop_column(column=added_column)
8976
8977            # Delete dataframe
8978            del dataframe_trio
8979            gc.collect()
8980
8981    def calculation_vaf_normalization(self) -> None:
8982        """
8983        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8984        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8985        :return: The function does not return anything.
8986        """
8987
8988        # if FORMAT and samples
8989        if (
8990            "FORMAT" in self.get_header_columns_as_list()
8991            and self.get_header_sample_list()
8992        ):
8993
8994            # vaf_normalization annotation field
8995            vaf_normalization_tag = "VAF"
8996
8997            # VCF infos tags
8998            vcf_infos_tags = {
8999                "VAF": "VAF Variant Frequency",
9000            }
9001
9002            # Prefix
9003            prefix = self.get_explode_infos_prefix()
9004
9005            # Variants table
9006            table_variants = self.get_table_variants()
9007
9008            # Header
9009            vcf_reader = self.get_header()
9010
9011            # Do not calculate if VAF already exists
9012            if "VAF" in vcf_reader.formats:
9013                log.debug("VAF already on genotypes")
9014                return
9015
9016            # Create variant id
9017            variant_id_column = self.get_variant_id_column()
9018            added_columns = [variant_id_column]
9019
9020            # variant_id, FORMAT and samples
9021            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9022                f""" "{sample}" """ for sample in self.get_header_sample_list()
9023            )
9024
9025            # Create dataframe
9026            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9027            log.debug(f"query={query}")
9028            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9029
9030            vaf_normalization_set = []
9031
9032            # for each sample vaf_normalization
9033            for sample in self.get_header_sample_list():
9034                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9035                    lambda row: vaf_normalization(row, sample=sample), axis=1
9036                )
9037                vaf_normalization_set.append(
9038                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9039                )
9040
9041            # Add VAF to FORMAT
9042            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9043                "FORMAT"
9044            ].apply(lambda x: str(x) + ":VAF")
9045            vaf_normalization_set.append(
9046                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9047            )
9048
9049            # Add vaf_normalization to header
9050            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9051                id=vaf_normalization_tag,
9052                num="1",
9053                type="Float",
9054                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9055                type_code=self.code_type_map.get("Float"),
9056            )
9057
9058            # Create fields to add in INFO
9059            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9060
9061            # Update
9062            sql_update = f"""
9063                UPDATE {table_variants}
9064                SET {sql_vaf_normalization_set}
9065                FROM dataframe_vaf_normalization
9066                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9067
9068            """
9069            self.conn.execute(sql_update)
9070
9071            # Remove added columns
9072            for added_column in added_columns:
9073                self.drop_column(column=added_column)
9074
9075            # Delete dataframe
9076            del dataframe_vaf_normalization
9077            gc.collect()
9078
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are added: <info>_stats_nb, _list, _min, _max, _mean,
        _mediane and _stdev.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one entry per statistic; iteration order below
            # follows this insertion order)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute all statistics per variant from the sample genotypes
            # (genotype_stats presumably returns a mapping stat name -> value — TODO confirm)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add this statistic to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First field gets no separator; subsequent fields prefix ';'
                # NOTE(review): if the first stat is NULL but a later one is not,
                # the later field still carries its ';' prefix right after INFO's
                # trailing ';' — could yield ';;'; confirm intended.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO with all computed statistics at once
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
9216
9217    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
9218        """
9219        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
9220        to it if transcripts are available.
9221
9222        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
9223        parameter that specifies the information field to be used in the transcripts JSON. It has a
9224        default value of "transcripts_json" if no value is provided when calling the method, defaults to
9225        transcripts_json
9226        :type info: str (optional)
9227        """
9228
9229        # Create transcripts table
9230        transcripts_table = self.create_transcript_view()
9231
9232        # Add info field
9233        if transcripts_table:
9234            self.transcript_view_to_variants(
9235                transcripts_table=transcripts_table, transcripts_info_field=info
9236            )
9237        else:
9238            log.info("No Transcripts to process. Check param.json file configuration")
9239
9240    ###############
9241    # Transcripts #
9242    ###############
9243
9244    def create_transcript_view_from_columns_map(
9245        self,
9246        transcripts_table: str = "transcripts",
9247        columns_maps: dict = {},
9248        added_columns: list = [],
9249        temporary_tables: list = None,
9250        annotation_fields: list = None,
9251    ) -> tuple[list, list, list]:
9252        """
9253        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9254        specified columns mapping for transcripts data.
9255
9256        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9257        the table where the transcripts data is stored or will be stored in the database. This table
9258        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9259        predictions, etc. It defaults to "transcripts, defaults to transcripts
9260        :type transcripts_table: str (optional)
9261        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9262        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9263        represents a mapping configuration for a specific set of columns. It typically includes details such
9264        as the main transcript column and additional information columns
9265        :type columns_maps: dict
9266        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9267        function is a list that stores the additional columns that will be added to the view being created
9268        based on the columns map provided. These columns are generated by exploding the transcript
9269        information columns along with the main transcript column
9270        :type added_columns: list
9271        :param temporary_tables: The `temporary_tables` parameter in the
9272        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9273        tables created during the process of creating a transcript view from a columns map. These temporary
9274        tables are used to store intermediate results or transformations before the final view is generated
9275        :type temporary_tables: list
9276        :param annotation_fields: The `annotation_fields` parameter in the
9277        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9278        for annotation in the query view creation process. These fields are extracted from the
9279        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9280        :type annotation_fields: list
9281        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9282        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9283        """
9284
9285        log.debug("Start transcrpts view creation from columns map...")
9286
9287        # "from_columns_map": [
9288        #     {
9289        #         "transcripts_column": "Ensembl_transcriptid",
9290        #         "transcripts_infos_columns": [
9291        #             "genename",
9292        #             "Ensembl_geneid",
9293        #             "LIST_S2_score",
9294        #             "LIST_S2_pred",
9295        #         ],
9296        #     },
9297        #     {
9298        #         "transcripts_column": "Ensembl_transcriptid",
9299        #         "transcripts_infos_columns": [
9300        #             "genename",
9301        #             "VARITY_R_score",
9302        #             "Aloft_pred",
9303        #         ],
9304        #     },
9305        # ],
9306
9307        # Init
9308        if temporary_tables is None:
9309            temporary_tables = []
9310        if annotation_fields is None:
9311            annotation_fields = []
9312
9313        # Variants table
9314        table_variants = self.get_table_variants()
9315
9316        for columns_map in columns_maps:
9317
9318            # Transcript column
9319            transcripts_column = columns_map.get("transcripts_column", None)
9320
9321            # Transcripts infos columns
9322            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9323
9324            if transcripts_column is not None:
9325
9326                # Explode
9327                added_columns += self.explode_infos(
9328                    fields=[transcripts_column] + transcripts_infos_columns
9329                )
9330
9331                # View clauses
9332                clause_select = []
9333                for field in [transcripts_column] + transcripts_infos_columns:
9334                    clause_select.append(
9335                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9336                    )
9337                    if field not in [transcripts_column]:
9338                        annotation_fields.append(field)
9339
9340                # Querey View
9341                query = f""" 
9342                    SELECT
9343                        "#CHROM", POS, REF, ALT,
9344                        "{transcripts_column}" AS 'transcript',
9345                        {", ".join(clause_select)}
9346                    FROM (
9347                        SELECT 
9348                            "#CHROM", POS, REF, ALT,
9349                            {", ".join(clause_select)}
9350                        FROM {table_variants}
9351                        )
9352                    WHERE "{transcripts_column}" IS NOT NULL
9353                """
9354
9355                # Create temporary table
9356                temporary_table = transcripts_table + "".join(
9357                    random.choices(string.ascii_uppercase + string.digits, k=10)
9358                )
9359
9360                # Temporary_tables
9361                temporary_tables.append(temporary_table)
9362                query_view = f"""
9363                    CREATE TEMPORARY TABLE {temporary_table}
9364                    AS ({query})
9365                """
9366                self.execute_query(query=query_view)
9367
9368        return added_columns, temporary_tables, annotation_fields
9369
9370    def create_transcript_view_from_column_format(
9371        self,
9372        transcripts_table: str = "transcripts",
9373        column_formats: dict = {},
9374        temporary_tables: list = None,
9375        annotation_fields: list = None,
9376    ) -> tuple[list, list, list]:
9377        """
9378        The `create_transcript_view_from_column_format` function generates a transcript view based on
9379        specified column formats, adds additional columns and annotation fields, and returns the list of
9380        temporary tables and annotation fields.
9381
9382        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9383        the table containing the transcripts data. This table will be used as the base table for creating
9384        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9385        different table name if needed, defaults to transcripts
9386        :type transcripts_table: str (optional)
9387        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9388        about the columns to be used for creating the transcript view. Each entry in the dictionary
9389        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9390        the provided code snippet:
9391        :type column_formats: dict
9392        :param temporary_tables: The `temporary_tables` parameter in the
9393        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9394        views created during the process of creating a transcript view from a column format. These temporary
9395        views are used to manipulate and extract data before generating the final transcript view. It
9396        :type temporary_tables: list
9397        :param annotation_fields: The `annotation_fields` parameter in the
9398        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9399        that are extracted from the temporary views created during the process. These annotation fields are
9400        obtained by querying the temporary views and extracting the column names excluding specific columns
9401        like `#CH
9402        :type annotation_fields: list
9403        :return: The `create_transcript_view_from_column_format` function returns two lists:
9404        `temporary_tables` and `annotation_fields`.
9405        """
9406
9407        log.debug("Start transcrpts view creation from column format...")
9408
9409        #  "from_column_format": [
9410        #     {
9411        #         "transcripts_column": "ANN",
9412        #         "transcripts_infos_column": "Feature_ID",
9413        #     }
9414        # ],
9415
9416        # Init
9417        if temporary_tables is None:
9418            temporary_tables = []
9419        if annotation_fields is None:
9420            annotation_fields = []
9421
9422        for column_format in column_formats:
9423
9424            # annotation field and transcript annotation field
9425            annotation_field = column_format.get("transcripts_column", "ANN")
9426            transcript_annotation = column_format.get(
9427                "transcripts_infos_column", "Feature_ID"
9428            )
9429
9430            # Temporary View name
9431            temporary_view_name = transcripts_table + "".join(
9432                random.choices(string.ascii_uppercase + string.digits, k=10)
9433            )
9434
9435            # Create temporary view name
9436            temporary_view_name = self.annotation_format_to_table(
9437                uniquify=True,
9438                annotation_field=annotation_field,
9439                view_name=temporary_view_name,
9440                annotation_id=transcript_annotation,
9441            )
9442
9443            # Annotation fields
9444            if temporary_view_name:
9445                query_annotation_fields = f"""
9446                    SELECT *
9447                    FROM (
9448                        DESCRIBE SELECT *
9449                        FROM {temporary_view_name}
9450                        )
9451                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9452                """
9453                df_annotation_fields = self.get_query_to_df(
9454                    query=query_annotation_fields
9455                )
9456
9457                # Add temporary view and annotation fields
9458                temporary_tables.append(temporary_view_name)
9459                annotation_fields += list(set(df_annotation_fields["column_name"]))
9460
9461        return temporary_tables, annotation_fields
9462
9463    def create_transcript_view(
9464        self,
9465        transcripts_table: str = None,
9466        transcripts_table_drop: bool = True,
9467        param: dict = {},
9468    ) -> str:
9469        """
9470        The `create_transcript_view` function generates a transcript view by processing data from a
9471        specified table based on provided parameters and structural information.
9472
9473        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9474        is used to specify the name of the table that will store the final transcript view data. If a table
9475        name is not provided, the function will create a new table to store the transcript view data, and by
9476        default,, defaults to transcripts
9477        :type transcripts_table: str (optional)
9478        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9479        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9480        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9481        the function will drop the existing transcripts table if it exists, defaults to True
9482        :type transcripts_table_drop: bool (optional)
9483        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9484        contains information needed to create a transcript view. It includes details such as the structure
9485        of the transcripts, columns mapping, column formats, and other necessary information for generating
9486        the view. This parameter allows for flexibility and customization
9487        :type param: dict
9488        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9489        created or modified during the execution of the function.
9490        """
9491
9492        log.debug("Start transcrpts view creation...")
9493
9494        # Default
9495        transcripts_table_default = "transcripts"
9496
9497        # Param
9498        if not param:
9499            param = self.get_param()
9500
9501        # Struct
9502        struct = param.get("transcripts", {}).get("struct", None)
9503
9504        if struct:
9505
9506            # Transcripts table
9507            if transcripts_table is None:
9508                transcripts_table = param.get("transcripts", {}).get(
9509                    "table", transcripts_table_default
9510                )
9511
9512            # added_columns
9513            added_columns = []
9514
9515            # Temporary tables
9516            temporary_tables = []
9517
9518            # Annotation fields
9519            annotation_fields = []
9520
9521            # from columns map
9522            columns_maps = struct.get("from_columns_map", [])
9523            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9524                self.create_transcript_view_from_columns_map(
9525                    transcripts_table=transcripts_table,
9526                    columns_maps=columns_maps,
9527                    added_columns=added_columns,
9528                    temporary_tables=temporary_tables,
9529                    annotation_fields=annotation_fields,
9530                )
9531            )
9532            added_columns += added_columns_tmp
9533            temporary_tables += temporary_tables_tmp
9534            annotation_fields += annotation_fields_tmp
9535
9536            # from column format
9537            column_formats = struct.get("from_column_format", [])
9538            temporary_tables_tmp, annotation_fields_tmp = (
9539                self.create_transcript_view_from_column_format(
9540                    transcripts_table=transcripts_table,
9541                    column_formats=column_formats,
9542                    temporary_tables=temporary_tables,
9543                    annotation_fields=annotation_fields,
9544                )
9545            )
9546            temporary_tables += temporary_tables_tmp
9547            annotation_fields += annotation_fields_tmp
9548
9549            # Merge temporary tables query
9550            query_merge = ""
9551            for temporary_table in temporary_tables:
9552
9553                # First temporary table
9554                if not query_merge:
9555                    query_merge = f"""
9556                        SELECT * FROM {temporary_table}
9557                    """
9558                # other temporary table (using UNION)
9559                else:
9560                    query_merge += f"""
9561                        UNION BY NAME SELECT * FROM {temporary_table}
9562                    """
9563
9564            # Merge on transcript
9565            query_merge_on_transcripts_annotation_fields = []
9566            # Aggregate all annotations fields
9567            for annotation_field in set(annotation_fields):
9568                query_merge_on_transcripts_annotation_fields.append(
9569                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9570                )
9571            # Query for transcripts view
9572            query_merge_on_transcripts = f"""
9573                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9574                FROM ({query_merge})
9575                GROUP BY "#CHROM", POS, REF, ALT, transcript
9576            """
9577
9578            # Drop transcript view is necessary
9579            if transcripts_table_drop:
9580                query_drop = f"""
9581                    DROP TABLE IF EXISTS {transcripts_table};
9582                """
9583                self.execute_query(query=query_drop)
9584
9585            # Merge and create transcript view
9586            query_create_view = f"""
9587                CREATE TABLE IF NOT EXISTS {transcripts_table}
9588                AS {query_merge_on_transcripts}
9589            """
9590            self.execute_query(query=query_create_view)
9591
9592            # Remove added columns
9593            for added_column in added_columns:
9594                self.drop_column(column=added_column)
9595
9596        else:
9597
9598            transcripts_table = None
9599
9600        return transcripts_table
9601
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff "ANN") into a
        temporary table with one typed column per annotation sub-field and one
        row per annotation entry, keyed by a 'transcript' column.

        The sub-field names are parsed from the quoted, pipe-separated list in
        the field's VCF header description.

        :param uniquify: Passed through to `explode_annotation_format`;
        presumably de-duplicates annotation entries — confirm in that helper,
        defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field containing the structured
        annotations; must be declared in the VCF header, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
        identifier; non-alphanumeric characters are stripped to match the
        cleaned column names, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
        transcripts
        :type view_name: str (optional)
        :return: The created view name, or None when `annotation_field` is not
        declared in the VCF header.
        :raises ValueError: When the header description does not contain a
        quoted " | "-separated sub-field list.
        """

        # Name of the intermediate column holding the exploded JSON
        annotation_format = "annotation_explode"

        # Sanitize the transcript sub-field name (matches cleaned column names below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded columns
        # NOTE(review): any truthy configured prefix is replaced by the literal
        # "INFO/" — presumably to match explode_infos column naming; confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation and the exploded JSON
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table; dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into a dedicated column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Parse the sub-field names from the quoted " | "-separated list
            # in the header description (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # ann_header: cleaned names (not used after this loop);
                # ann_header_desc: cleaned name -> original name
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Strip non-alphanumeric characters for safe column names
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Unique variant identifier column (used to join back to variants)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant coordinates and the raw annotation column into pandas
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each raw annotation string into a JSON document whose
            # entries map sub-field names to values
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct sub-field keys present in the first JSON entry ('$.0')
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed extraction clause per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original key name (as found in the JSON)
                key = row.iloc[0]

                # Cleaned key used as SQL column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Sample the key's values to infer its SQL type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Infer the SQL type from the non-empty sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # Extraction clause: empty strings become NULL, cast to the inferred type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table,
            # aliasing the transcript identifier column as 'transcript'
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: signal failure
            view_name = None

        # Remove the columns added on the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
9754
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field: str = None,
        param: dict = {},
    ) -> bool:
        """
        Aggregate the transcripts table into a per-variant JSON document and
        write it back to the variants table, either as a dedicated JSON column
        (`transcripts_info_json`), as an INFO field (`transcripts_info_field`),
        or both. Registers the chosen name(s) in the VCF header.

        :param transcripts_table: Name of the transcripts table; when None,
        taken from param "transcripts"/"table", defaulting to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_column_id: Column identifying the transcript in the
        transcripts table; when None, taken from param
        "transcripts"/"column_id", defaulting to "transcript"
        :type transcripts_column_id: str (optional)
        :param transcripts_info_json: Name of the JSON column to create on the
        variants table; when None, taken from param (default None)
        :type transcripts_info_json: str (optional)
        :param transcripts_info_field: Name of the INFO field to append the
        JSON to; when None, taken from param (default None)
        :type transcripts_info_field: str (optional)
        :param param: Parameters dict; when empty, self.get_param() is used
        :type param: dict (optional)
        :return: True when an update was performed, False when neither
        `transcripts_info_json` nor `transcripts_info_field` is configured.
        """

        log.debug("Start transcripts view to JSON...")

        # Fallback values when param does not provide them
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_field_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info JSON column name
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field name
        if transcripts_info_field is None:
            transcripts_info_field = param.get("transcripts", {}).get(
                "transcripts_info_field", transcripts_info_field_default
            )

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to write back when neither output target is configured
        if transcripts_info_json is None and transcripts_info_field is None:
            return False

        # List the annotation columns of the transcripts table
        # (everything except coordinates and the transcript ID)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
                )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SELECT clauses (split comma-separated values into rows) and
        # JSON struct entries for each annotation column
        clause_select = []
        clause_to_json = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)

        # SET clauses of the final UPDATE
        update_set = []

        # VCF header
        vcf_reader = self.get_header()

        # Target 1: dedicated JSON column on the variants table
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add to update
            update_set.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

            # Register the column name as an INFO field in the VCF header
            # NOTE(review): "unknwon" (sic) is emitted as the header's
            # source/version fields — typo preserved here; confirm intent
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Target 2: append the JSON to the INFO field
        if transcripts_info_field is not None:

            # Append ';<field>=<json>' to INFO (empty/'.' INFO treated as '')
            # NOTE(review): when INFO is empty the appended value keeps a
            # leading ';'; also, when transcripts_info_json is None the SQL
            # below references the aggregated JSON via the literal identifier
            # "None" (aliased as such in the subquery) — works, but fragile;
            # confirm against DuckDB identifier rules
            update_set.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field}=',
                                    t.{transcripts_info_json}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Register the INFO field in the VCF header
            # NOTE(review): same "unknwon" typo as above
            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
                transcripts_info_field,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Final UPDATE: aggregate one JSON object per variant, mapping each
        # transcript ID to a JSON struct of its annotation values, then join
        # back to the variants table on coordinates
        query_update = f"""
            UPDATE {table_variants}
                SET {", ".join(update_set)}
            FROM
            (
                SELECT
                    "#CHROM", POS, REF, ALT,
                        concat(
                        '{{',
                        string_agg(
                            '"' || "{transcripts_column_id}" || '":' ||
                            to_json(json_output)
                        ),
                        '}}'
                        )::JSON AS {transcripts_info_json}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        "{transcripts_column_id}",
                        to_json(
                            {{{",".join(clause_to_json)}}}
                        )::JSON AS json_output
                    FROM
                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                    WHERE "{transcripts_column_id}" IS NOT NULL
                    )
                GROUP BY "#CHROM", POS, REF, ALT
            ) AS t
            WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
        """

        self.execute_query(query=query_update)

        return True
class Variants:
  34class Variants:
  35
  36    def __init__(
  37        self,
  38        conn=None,
  39        input: str = None,
  40        output: str = None,
  41        config: dict = {},
  42        param: dict = {},
  43        load: bool = False,
  44    ) -> None:
  45        """
  46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  47        header
  48
  49        :param conn: the connection to the database
  50        :param input: the input file
  51        :param output: the output file
  52        :param config: a dictionary containing the configuration of the model
  53        :param param: a dictionary containing the parameters of the model
  54        """
  55
  56        # Init variables
  57        self.init_variables()
  58
  59        # Input
  60        self.set_input(input)
  61
  62        # Config
  63        self.set_config(config)
  64
  65        # Param
  66        self.set_param(param)
  67
  68        # Output
  69        self.set_output(output)
  70
  71        # connexion
  72        self.set_connexion(conn)
  73
  74        # Header
  75        self.set_header()
  76
  77        # Load data
  78        if load:
  79            self.load_data()
  80
  81    def set_input(self, input: str = None) -> None:
  82        """
  83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  84        attributes in the class accordingly.
  85
  86        :param input: The `set_input` method in the provided code snippet is used to set attributes
  87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  88        :type input: str
  89        """
  90
  91        if input and not isinstance(input, str):
  92            try:
  93                self.input = input.name
  94            except:
  95                log.error(f"Input file '{input} in bad format")
  96                raise ValueError(f"Input file '{input} in bad format")
  97        else:
  98            self.input = input
  99
 100        # Input format
 101        if input:
 102            input_name, input_extension = os.path.splitext(self.input)
 103            self.input_name = input_name
 104            self.input_extension = input_extension
 105            self.input_format = self.input_extension.replace(".", "")
 106
 107    def set_config(self, config: dict) -> None:
 108        """
 109        The set_config function takes a config object and assigns it as the configuration object for the
 110        class.
 111
 112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
 113        contains configuration settings for the class. When you call the `set_config` function with a
 114        dictionary object as the argument, it will set that dictionary as the configuration object for
 115        the class
 116        :type config: dict
 117        """
 118
 119        self.config = config
 120
 121    def set_param(self, param: dict) -> None:
 122        """
 123        This function sets a parameter object for the class based on the input dictionary.
 124
 125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
 126        as the `param` attribute of the class instance
 127        :type param: dict
 128        """
 129
 130        self.param = param
 131
 132    def init_variables(self) -> None:
 133        """
 134        This function initializes the variables that will be used in the rest of the class
 135        """
 136
 137        self.prefix = "howard"
 138        self.table_variants = "variants"
 139        self.dataframe = None
 140
 141        self.comparison_map = {
 142            "gt": ">",
 143            "gte": ">=",
 144            "lt": "<",
 145            "lte": "<=",
 146            "equals": "=",
 147            "contains": "SIMILAR TO",
 148        }
 149
 150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 151
 152        self.code_type_map_to_sql = {
 153            "Integer": "INTEGER",
 154            "String": "VARCHAR",
 155            "Float": "FLOAT",
 156            "Flag": "VARCHAR",
 157        }
 158
 159        self.index_additionnal_fields = []
 160
 161    def get_indexing(self) -> bool:
 162        """
 163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 164        returns False.
 165        :return: The value of the indexing parameter.
 166        """
 167
 168        return self.get_param().get("indexing", False)
 169
 170    def get_connexion_config(self) -> dict:
 171        """
 172        The function `get_connexion_config` returns a dictionary containing the configuration for a
 173        connection, including the number of threads and memory limit.
 174        :return: a dictionary containing the configuration for the Connexion library.
 175        """
 176
 177        # config
 178        config = self.get_config()
 179
 180        # Connexion config
 181        connexion_config = {}
 182        threads = self.get_threads()
 183
 184        # Threads
 185        if threads:
 186            connexion_config["threads"] = threads
 187
 188        # Memory
 189        # if config.get("memory", None):
 190        #     connexion_config["memory_limit"] = config.get("memory")
 191        if self.get_memory():
 192            connexion_config["memory_limit"] = self.get_memory()
 193
 194        # Temporary directory
 195        if config.get("tmp", None):
 196            connexion_config["temp_directory"] = config.get("tmp")
 197
 198        # Access
 199        if config.get("access", None):
 200            access = config.get("access")
 201            if access in ["RO"]:
 202                access = "READ_ONLY"
 203            elif access in ["RW"]:
 204                access = "READ_WRITE"
 205            connexion_db = self.get_connexion_db()
 206            if connexion_db in ":memory:":
 207                access = "READ_WRITE"
 208            connexion_config["access_mode"] = access
 209
 210        return connexion_config
 211
 212    def get_duckdb_settings(self) -> dict:
 213        """
 214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 215        string.
 216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 217        """
 218
 219        # config
 220        config = self.get_config()
 221
 222        # duckdb settings
 223        duckdb_settings_dict = {}
 224        if config.get("duckdb_settings", None):
 225            duckdb_settings = config.get("duckdb_settings")
 226            duckdb_settings = full_path(duckdb_settings)
 227            # duckdb setting is a file
 228            if os.path.exists(duckdb_settings):
 229                with open(duckdb_settings) as json_file:
 230                    duckdb_settings_dict = yaml.safe_load(json_file)
 231            # duckdb settings is a string
 232            else:
 233                duckdb_settings_dict = json.loads(duckdb_settings)
 234
 235        return duckdb_settings_dict
 236
 237    def set_connexion_db(self) -> str:
 238        """
 239        The function `set_connexion_db` returns the appropriate database connection string based on the
 240        input format and connection type.
 241        :return: the value of the variable `connexion_db`.
 242        """
 243
 244        # Default connexion db
 245        default_connexion_db = ":memory:"
 246
 247        # Find connexion db
 248        if self.get_input_format() in ["db", "duckdb"]:
 249            connexion_db = self.get_input()
 250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 251            connexion_db = default_connexion_db
 252        elif self.get_connexion_type() in ["tmpfile"]:
 253            tmp_name = tempfile.mkdtemp(
 254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 255            )
 256            connexion_db = f"{tmp_name}/tmp.db"
 257        elif self.get_connexion_type() != "":
 258            connexion_db = self.get_connexion_type()
 259        else:
 260            connexion_db = default_connexion_db
 261
 262        # Set connexion db
 263        self.connexion_db = connexion_db
 264
 265        return connexion_db
 266
 267    def set_connexion(self, conn) -> None:
 268        """
 269        The function `set_connexion` creates a connection to a database, with options for different
 270        database formats and settings.
 271
 272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
 273        database. If a connection is not provided, a new connection to an in-memory database is created.
 274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
 275        sqlite
 276        """
 277
 278        # Connexion db
 279        connexion_db = self.set_connexion_db()
 280
 281        # Connexion config
 282        connexion_config = self.get_connexion_config()
 283
 284        # Connexion format
 285        connexion_format = self.get_config().get("connexion_format", "duckdb")
 286        # Set connexion format
 287        self.connexion_format = connexion_format
 288
 289        # Connexion
 290        if not conn:
 291            if connexion_format in ["duckdb"]:
 292                conn = duckdb.connect(connexion_db, config=connexion_config)
 293                # duckDB settings
 294                duckdb_settings = self.get_duckdb_settings()
 295                if duckdb_settings:
 296                    for setting in duckdb_settings:
 297                        setting_value = duckdb_settings.get(setting)
 298                        if isinstance(setting_value, str):
 299                            setting_value = f"'{setting_value}'"
 300                        conn.execute(f"PRAGMA {setting}={setting_value};")
 301            elif connexion_format in ["sqlite"]:
 302                conn = sqlite3.connect(connexion_db)
 303
 304        # Set connexion
 305        self.conn = conn
 306
 307        # Log
 308        log.debug(f"connexion_format: {connexion_format}")
 309        log.debug(f"connexion_db: {connexion_db}")
 310        log.debug(f"connexion config: {connexion_config}")
 311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 312
 313    def set_output(self, output: str = None) -> None:
 314        """
 315        The `set_output` function in Python sets the output file based on the input or a specified key
 316        in the config file, extracting the output name, extension, and format.
 317
 318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
 319        the output file. If the config file has an 'output' key, the method sets the output to the value
 320        of that key. If no output is provided, it sets the output to `None`
 321        :type output: str
 322        """
 323
 324        if output and not isinstance(output, str):
 325            self.output = output.name
 326        else:
 327            self.output = output
 328
 329        # Output format
 330        if self.output:
 331            output_name, output_extension = os.path.splitext(self.output)
 332            self.output_name = output_name
 333            self.output_extension = output_extension
 334            self.output_format = self.output_extension.replace(".", "")
 335        else:
 336            self.output_name = None
 337            self.output_extension = None
 338            self.output_format = None
 339
    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list
        of strings (`self.header_list`) and as a `vcf.Reader` object
        (`self.header_vcf`); both are None when there is no input file.

        Header lookup order:
        1. explicit header file given in config ("header_file");
        2. the input file itself for vcf/hdr formats;
        3. a sibling ".hdr" file next to the input file;
        4. a header inferred from the file columns (via `Database`);
        5. a minimal default VCFv4.2 header as last resort.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header (VCFv4.2 with the mandatory columns)
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # NOTE(review): bare except kept as-is — any failure
                        # here deliberately falls back to the default header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Guard against an empty header
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
 441
 442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
 443        """
 444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
 445        DataFrame based on the connection format.
 446
 447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
 448        represents the SQL query you want to execute. This query will be used to fetch data from a
 449        database and convert it into a pandas DataFrame
 450        :type query: str
 451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
 452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
 453        function will only fetch up to that number of rows from the database query result. If no limit
 454        is specified,
 455        :type limit: int
 456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
 457        """
 458
 459        # Connexion format
 460        connexion_format = self.get_connexion_format()
 461
 462        # Limit in query
 463        if limit:
 464            pd.set_option("display.max_rows", limit)
 465            if connexion_format in ["duckdb"]:
 466                df = (
 467                    self.conn.execute(query)
 468                    .fetch_record_batch(limit)
 469                    .read_next_batch()
 470                    .to_pandas()
 471                )
 472            elif connexion_format in ["sqlite"]:
 473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
 474
 475        # Full query
 476        else:
 477            if connexion_format in ["duckdb"]:
 478                df = self.conn.execute(query).df()
 479            elif connexion_format in ["sqlite"]:
 480                df = pd.read_sql_query(query, self.conn)
 481
 482        return df
 483
 484    def get_overview(self) -> None:
 485        """
 486        The function prints the input, output, config, and dataframe of the current object
 487        """
 488        table_variants_from = self.get_table_variants(clause="from")
 489        sql_columns = self.get_header_columns_as_sql()
 490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 491        df = self.get_query_to_df(sql_query_export)
 492        log.info(
 493            "Input:  "
 494            + str(self.get_input())
 495            + " ["
 496            + str(str(self.get_input_format()))
 497            + "]"
 498        )
 499        log.info(
 500            "Output: "
 501            + str(self.get_output())
 502            + " ["
 503            + str(str(self.get_output_format()))
 504            + "]"
 505        )
 506        log.info("Config: ")
 507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 508            "\n"
 509        ):
 510            log.info("\t" + str(d))
 511        log.info("Param: ")
 512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 513            "\n"
 514        ):
 515            log.info("\t" + str(d))
 516        log.info("Sample list: " + str(self.get_header_sample_list()))
 517        log.info("Dataframe: ")
 518        for d in str(df).split("\n"):
 519            log.info("\t" + str(d))
 520
 521        # garbage collector
 522        del df
 523        gc.collect()
 524
 525        return None
 526
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object and return them as a dict.

        The returned dictionary contains:
        - "Infos": input file, total number of variants, number of samples
          (when applicable) and of INFO/FORMAT fields;
        - "Variants": counts per chromosome, SNV/InDel/MNV/Total counts and
          SNV substitution breakdown;
        - "Samples": per-sample genotype counts (only for VCF-like input
          with a GT format field and a FORMAT column);
        - "Header": description of INFO and FORMAT fields;
        - "Quality": QUAL statistics (only when a QUAL column is present).

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Genotype stats only make sense for VCF-like input with a GT
        # format field and a FORMAT column
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count each genotype pattern (e.g. 0/1, 1|1) for the sample;
                # rows are kept only when the sample column has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: translate pyvcf special codes (None/-1/-2/-3 are
                # the VCF '.', 'A', 'G', 'R' cardinalities)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics (missing values '.' are ignored)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution counts for SNVs (e.g. A>G)
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 748
 749    def stats_to_file(self, file: str = None) -> str:
 750        """
 751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 752        into a JSON object, and writes the JSON object to the specified file.
 753
 754        :param file: The `file` parameter is a string that represents the file path where the JSON data
 755        will be written
 756        :type file: str
 757        :return: the name of the file that was written to.
 758        """
 759
 760        # Get stats
 761        stats = self.get_stats()
 762
 763        # Serializing json
 764        json_object = json.dumps(stats, indent=4)
 765
 766        # Writing to sample.json
 767        with open(file, "w") as outfile:
 768            outfile.write(json_object)
 769
 770        return file
 771
 772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
 773        """
 774        The `print_stats` function generates a markdown file and prints the statistics contained in a
 775        JSON file in a formatted manner.
 776
 777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
 778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
 779        provided, a temporary directory will be created and the stats will be saved in a file named
 780        "stats.md" within that
 781        :type output_file: str
 782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
 783        file where the statistics will be saved. If no value is provided, a temporary directory will be
 784        created and a default file name "stats.json" will be used
 785        :type json_file: str
 786        :return: The function `print_stats` does not return any value. It has a return type annotation
 787        of `None`.
 788        """
 789
 790        # Full path
 791        output_file = full_path(output_file)
 792        json_file = full_path(json_file)
 793
 794        with tempfile.TemporaryDirectory() as tmpdir:
 795
 796            # Files
 797            if not output_file:
 798                output_file = os.path.join(tmpdir, "stats.md")
 799            if not json_file:
 800                json_file = os.path.join(tmpdir, "stats.json")
 801
 802            # Create folders
 803            if not os.path.exists(os.path.dirname(output_file)):
 804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
 805            if not os.path.exists(os.path.dirname(json_file)):
 806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
 807
 808            # Create stats JSON file
 809            stats_file = self.stats_to_file(file=json_file)
 810
 811            # Print stats file
 812            with open(stats_file) as f:
 813                stats = yaml.safe_load(f)
 814
 815            # Output
 816            output_title = []
 817            output_index = []
 818            output = []
 819
 820            # Title
 821            output_title.append("# HOWARD Stats")
 822
 823            # Index
 824            output_index.append("## Index")
 825
 826            # Process sections
 827            for section in stats:
 828                infos = stats.get(section)
 829                section_link = "#" + section.lower().replace(" ", "-")
 830                output.append(f"## {section}")
 831                output_index.append(f"- [{section}]({section_link})")
 832
 833                if len(infos):
 834                    for info in infos:
 835                        try:
 836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
 837                            is_df = True
 838                        except:
 839                            try:
 840                                df = pd.DataFrame.from_dict(
 841                                    json.loads((infos.get(info))), orient="index"
 842                                )
 843                                is_df = True
 844                            except:
 845                                is_df = False
 846                        if is_df:
 847                            output.append(f"### {info}")
 848                            info_link = "#" + info.lower().replace(" ", "-")
 849                            output_index.append(f"   - [{info}]({info_link})")
 850                            output.append(f"{df.to_markdown(index=False)}")
 851                        else:
 852                            output.append(f"- {info}: {infos.get(info)}")
 853                else:
 854                    output.append(f"NA")
 855
 856            # Write stats in markdown file
 857            with open(output_file, "w") as fp:
 858                for item in output_title:
 859                    fp.write("%s\n" % item)
 860                for item in output_index:
 861                    fp.write("%s\n" % item)
 862                for item in output:
 863                    fp.write("%s\n" % item)
 864
 865            # Output stats in markdown
 866            print("")
 867            print("\n\n".join(output_title))
 868            print("")
 869            print("\n\n".join(output))
 870            print("")
 871
 872        return None
 873
 874    def get_input(self) -> str:
 875        """
 876        It returns the value of the input variable.
 877        :return: The input is being returned.
 878        """
 879        return self.input
 880
 881    def get_input_format(self, input_file: str = None) -> str:
 882        """
 883        This function returns the format of the input variable, either from the provided input file or
 884        by prompting for input.
 885
 886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
 887        represents the file path of the input file. If no `input_file` is provided when calling the
 888        method, it will default to `None`
 889        :type input_file: str
 890        :return: The format of the input variable is being returned.
 891        """
 892
 893        if not input_file:
 894            input_file = self.get_input()
 895        input_format = get_file_format(input_file)
 896        return input_format
 897
 898    def get_input_compressed(self, input_file: str = None) -> str:
 899        """
 900        The function `get_input_compressed` returns the format of the input variable after compressing
 901        it.
 902
 903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
 904        that represents the file path of the input file. If no `input_file` is provided when calling the
 905        method, it will default to `None` and the method will then call `self.get_input()` to
 906        :type input_file: str
 907        :return: The function `get_input_compressed` returns the compressed format of the input
 908        variable.
 909        """
 910
 911        if not input_file:
 912            input_file = self.get_input()
 913        input_compressed = get_file_compressed(input_file)
 914        return input_compressed
 915
 916    def get_output(self) -> str:
 917        """
 918        It returns the output of the neuron.
 919        :return: The output of the neural network.
 920        """
 921
 922        return self.output
 923
 924    def get_output_format(self, output_file: str = None) -> str:
 925        """
 926        The function `get_output_format` returns the format of the input variable or the output file if
 927        provided.
 928
 929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
 930        that represents the file path of the output file. If no `output_file` is provided when calling
 931        the method, it will default to the output obtained from the `get_output` method of the class
 932        instance. The
 933        :type output_file: str
 934        :return: The format of the input variable is being returned.
 935        """
 936
 937        if not output_file:
 938            output_file = self.get_output()
 939        output_format = get_file_format(output_file)
 940
 941        return output_format
 942
 943    def get_config(self) -> dict:
 944        """
 945        It returns the config
 946        :return: The config variable is being returned.
 947        """
 948        return self.config
 949
 950    def get_param(self) -> dict:
 951        """
 952        It returns the param
 953        :return: The param variable is being returned.
 954        """
 955        return self.param
 956
 957    def get_connexion_db(self) -> str:
 958        """
 959        It returns the connexion_db attribute of the object
 960        :return: The connexion_db is being returned.
 961        """
 962        return self.connexion_db
 963
 964    def get_prefix(self) -> str:
 965        """
 966        It returns the prefix of the object.
 967        :return: The prefix is being returned.
 968        """
 969        return self.prefix
 970
 971    def get_table_variants(self, clause: str = "select") -> str:
 972        """
 973        This function returns the table_variants attribute of the object
 974
 975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 976        defaults to select (optional)
 977        :return: The table_variants attribute of the object.
 978        """
 979
 980        # Access
 981        access = self.get_config().get("access", None)
 982
 983        # Clauses "select", "where", "update"
 984        if clause in ["select", "where", "update"]:
 985            table_variants = self.table_variants
 986        # Clause "from"
 987        elif clause in ["from"]:
 988            # For Read Only
 989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 990                input_file = self.get_input()
 991                table_variants = f"'{input_file}' as variants"
 992            # For Read Write
 993            else:
 994                table_variants = f"{self.table_variants} as variants"
 995        else:
 996            table_variants = self.table_variants
 997        return table_variants
 998
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )
1010
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")
1018
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn
1026
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()
1033
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required
1053
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0
1072
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""
1083
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []
1094
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)
1105
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples
1113
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)
1122
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format
1134
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion.

        :param file: path or open file handle of the delimited file to load
        :param columns: comma-separated, quoted column names used to build the
        INSERT statement (used by the duckdb branch only)
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. a VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
        config["load"]["chunk"], defaults to 1000000
        :type chunksize: int (optional)
        """

        # The "load.chunk" config entry takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize (0/None) silently skips the whole
        # load — confirm this is intentional
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves the unquoted "chunk" table name via its
                    # replacement scan on the local pandas DataFrame of the
                    # same name — do NOT rename the "chunk" variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: let pandas append the chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1188
1189    def load_data(
1190        self,
1191        input_file: str = None,
1192        drop_variants_table: bool = False,
1193        sample_size: int = 20480,
1194    ) -> None:
1195        """
1196        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1197        table before loading the data and specify a sample size.
1198
1199        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1200        table
1201        :type input_file: str
1202        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1203        determines whether the variants table should be dropped before loading the data. If set to
1204        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1205        not be dropped, defaults to False
1206        :type drop_variants_table: bool (optional)
1207        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1208        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1209        20480
1210        :type sample_size: int (optional)
1211        """
1212
1213        log.info("Loading...")
1214
1215        # change input file
1216        if input_file:
1217            self.set_input(input_file)
1218            self.set_header()
1219
1220        # drop variants table
1221        if drop_variants_table:
1222            self.drop_variants_table()
1223
1224        # get table variants
1225        table_variants = self.get_table_variants()
1226
1227        # Access
1228        access = self.get_config().get("access", None)
1229        log.debug(f"access: {access}")
1230
1231        # Input format and compress
1232        input_format = self.get_input_format()
1233        input_compressed = self.get_input_compressed()
1234        log.debug(f"input_format: {input_format}")
1235        log.debug(f"input_compressed: {input_compressed}")
1236
1237        # input_compressed_format
1238        if input_compressed:
1239            input_compressed_format = "gzip"
1240        else:
1241            input_compressed_format = "none"
1242        log.debug(f"input_compressed_format: {input_compressed_format}")
1243
1244        # Connexion format
1245        connexion_format = self.get_connexion_format()
1246
1247        # Sample size
1248        if not sample_size:
1249            sample_size = -1
1250        log.debug(f"sample_size: {sample_size}")
1251
1252        # Load data
1253        log.debug(f"Load Data from {input_format}")
1254
1255        # DuckDB connexion
1256        if connexion_format in ["duckdb"]:
1257
1258            # Database already exists
1259            if self.input_format in ["db", "duckdb"]:
1260
1261                if connexion_format in ["duckdb"]:
1262                    log.debug(f"Input file format '{self.input_format}' duckDB")
1263                else:
1264                    log.error(
1265                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1266                    )
1267                    raise ValueError(
1268                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1269                    )
1270
1271            # Load from existing database format
1272            else:
1273
1274                try:
1275                    # Create Table or View
1276                    database = Database(database=self.input)
1277                    sql_from = database.get_sql_from(sample_size=sample_size)
1278
1279                    if access in ["RO"]:
1280                        sql_load = (
1281                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1282                        )
1283                    else:
1284                        sql_load = (
1285                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1286                        )
1287                    self.conn.execute(sql_load)
1288
1289                except:
1290                    # Format not available
1291                    log.error(f"Input file format '{self.input_format}' not available")
1292                    raise ValueError(
1293                        f"Input file format '{self.input_format}' not available"
1294                    )
1295
1296        # SQLite connexion
1297        elif connexion_format in ["sqlite"] and input_format in [
1298            "vcf",
1299            "tsv",
1300            "csv",
1301            "psv",
1302        ]:
1303
1304            # Main structure
1305            structure = {
1306                "#CHROM": "VARCHAR",
1307                "POS": "INTEGER",
1308                "ID": "VARCHAR",
1309                "REF": "VARCHAR",
1310                "ALT": "VARCHAR",
1311                "QUAL": "VARCHAR",
1312                "FILTER": "VARCHAR",
1313                "INFO": "VARCHAR",
1314            }
1315
1316            # Strcuture with samples
1317            structure_complete = structure
1318            if self.get_header_sample_list():
1319                structure["FORMAT"] = "VARCHAR"
1320                for sample in self.get_header_sample_list():
1321                    structure_complete[sample] = "VARCHAR"
1322
1323            # Columns list for create and insert
1324            sql_create_table_columns = []
1325            sql_create_table_columns_list = []
1326            for column in structure_complete:
1327                column_type = structure_complete[column]
1328                sql_create_table_columns.append(
1329                    f'"{column}" {column_type} default NULL'
1330                )
1331                sql_create_table_columns_list.append(f'"{column}"')
1332
1333            # Create database
1334            log.debug(f"Create Table {table_variants}")
1335            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1336            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1337            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1338            self.conn.execute(sql_create_table)
1339
1340            # chunksize define length of file chunk load file
1341            chunksize = 100000
1342
1343            # delimiter
1344            delimiter = file_format_delimiters.get(input_format, "\t")
1345
1346            # Load the input file
1347            with open(self.input, "rt") as input_file:
1348
1349                # Use the appropriate file handler based on the input format
1350                if input_compressed:
1351                    input_file = bgzf.open(self.input, "rt")
1352                if input_format in ["vcf"]:
1353                    header_len = self.get_header_length()
1354                else:
1355                    header_len = 0
1356
1357                # Insert the file contents into a table
1358                self.insert_file_to_table(
1359                    input_file,
1360                    columns=sql_create_table_columns_list_sql,
1361                    header_len=header_len,
1362                    sep=delimiter,
1363                    chunksize=chunksize,
1364                )
1365
1366        else:
1367            log.error(
1368                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1369            )
1370            raise ValueError(
1371                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1372            )
1373
1374        # Explode INFOS fields into table fields
1375        if self.get_explode_infos():
1376            self.explode_infos(
1377                prefix=self.get_explode_infos_prefix(),
1378                fields=self.get_explode_infos_fields(),
1379                force=True,
1380            )
1381
1382        # Create index after insertion
1383        self.create_indexes()
1384
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)
1394
1395    def get_explode_infos_fields(
1396        self,
1397        explode_infos_fields: str = None,
1398        remove_fields_not_in_header: bool = False,
1399    ) -> list:
1400        """
1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1402        the input parameter `explode_infos_fields`.
1403
1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1406        comma-separated list of field names to explode
1407        :type explode_infos_fields: str
1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1409        flag that determines whether to remove fields that are not present in the header. If it is set
1410        to `True`, any field that is not in the header will be excluded from the list of exploded
1411        information fields. If it is set to `, defaults to False
1412        :type remove_fields_not_in_header: bool (optional)
1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
1417        splitting the string by commas.
1418        """
1419
1420        # If no fields, get it in param
1421        if not explode_infos_fields:
1422            explode_infos_fields = (
1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1424            )
1425
1426        # If no fields, defined as all fields in header using keyword
1427        if not explode_infos_fields:
1428            explode_infos_fields = "*"
1429
1430        # If fields list not empty
1431        if explode_infos_fields:
1432
1433            # Input fields list
1434            if isinstance(explode_infos_fields, str):
1435                fields_input = explode_infos_fields.split(",")
1436            elif isinstance(explode_infos_fields, list):
1437                fields_input = explode_infos_fields
1438            else:
1439                fields_input = []
1440
1441            # Fields list without * keyword
1442            fields_without_all = fields_input.copy()
1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
1444                fields_without_all.remove("*")
1445
1446            # Fields in header
1447            fields_in_header = sorted(list(set(self.get_header().infos)))
1448
1449            # Construct list of fields
1450            fields_output = []
1451            for field in fields_input:
1452
1453                # Strip field
1454                field = field.strip()
1455
1456                # format keyword * in regex
1457                if field.upper() in ["*"]:
1458                    field = ".*"
1459
1460                # Find all fields with pattern
1461                r = re.compile(field)
1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
1463
1464                # Remove fields input from search
1465                if fields_search != [field]:
1466                    fields_search = sorted(
1467                        list(set(fields_search).difference(fields_input))
1468                    )
1469
1470                # If field is not in header (avoid not well formatted header)
1471                if not fields_search and not remove_fields_not_in_header:
1472                    fields_search = [field]
1473
1474                # Add found fields
1475                for new_field in fields_search:
1476                    # Add field, if not already exists, and if it is in header (if asked)
1477                    if (
1478                        new_field not in fields_output
1479                        and (
1480                            not remove_fields_not_in_header
1481                            or new_field in fields_in_header
1482                        )
1483                        and new_field not in [".*"]
1484                    ):
1485                        fields_output.append(new_field)
1486
1487            return fields_output
1488
1489        else:
1490
1491            return []
1492
1493    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1494        """
1495        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1496        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1497        not provided.
1498
1499        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1500        prefix to be used for exploding or expanding information
1501        :type explode_infos_prefix: str
1502        :return: the value of the variable `explode_infos_prefix`.
1503        """
1504
1505        if not explode_infos_prefix:
1506            explode_infos_prefix = (
1507                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1508            )
1509
1510        return explode_infos_prefix
1511
    def add_column(
        self,
        table_name: str,
        column_name: str,
        column_type: str,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table of the current connexion if it does not
        already exist.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add
        :param column_type: SQL type of the new column (e.g. "INTEGER",
        "VARCHAR")
        :param default_value: optional DEFAULT value for the new column;
        interpolated verbatim into the SQL statement
        :param drop: if True and the column already exists, drop it first and
        re-create it; if False (default), an existing column is left untouched,
        defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name, column_name,
        column_type, default_value), or None when the column already existed —
        including the drop-and-recreate case, which is also reported as None
        """

        # Track whether a brand-new column was added / an existing one dropped
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the schema, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column present and no drop requested: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        # NOTE(review): identifiers and default_value are interpolated into
        # raw SQL — callers must pass trusted values only; verify no external
        # input reaches these parameters
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A drop-and-recreate is deliberately not reported as an addition
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
1583
1584    def drop_column(
1585        self, column: dict = None, table_name: str = None, column_name: str = None
1586    ) -> bool:
1587        """
1588        The `drop_column` function drops a specified column from a given table in a database and returns
1589        True if the column was successfully dropped, and False if the column does not exist in the
1590        table.
1591
1592        :param column: The `column` parameter is a dictionary that contains information about the column
1593        you want to drop. It has two keys:
1594        :type column: dict
1595        :param table_name: The `table_name` parameter is the name of the table from which you want to
1596        drop a column
1597        :type table_name: str
1598        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1599        from the table
1600        :type column_name: str
1601        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1602        and False if the column does not exist in the table.
1603        """
1604
1605        # Find column infos
1606        if column:
1607            if isinstance(column, dict):
1608                table_name = column.get("table_name", None)
1609                column_name = column.get("column_name", None)
1610            elif isinstance(column, str):
1611                table_name = self.get_table_variants()
1612                column_name = column
1613            else:
1614                table_name = None
1615                column_name = None
1616
1617        if not table_name and not column_name:
1618            return False
1619
1620        # Removed
1621        removed = False
1622
1623        # Check if the column already exists in the table
1624        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1625        columns = self.get_query_to_df(query).columns.tolist()
1626        if column_name in columns:
1627            log.debug(f"The {column_name} column exists in the {table_name} table")
1628        else:
1629            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1630            return False
1631
1632        # Add column in table # ALTER TABLE integers DROP k
1633        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1634        self.execute_query(add_column_query)
1635        removed = True
1636        log.debug(
1637            f"The {column_name} column was successfully dropped to the {table_name} table"
1638        )
1639
1640        return removed
1641
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
        columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
        individual columns. If this parameter is not provided, all INFO fields will be exploded
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate the column if it already exists in the table. If `force` is set to `True`, the column
        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns.
        """

        # drop indexes
        # (presumably so the UPDATEs below are not slowed by index maintenance;
        # pass create_index=True to rebuild them afterwards — TODO confirm)
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite"): selects the SQL dialect used below
        connexion_format = self.get_connexion_format()

        # Access mode: nothing is written when the database is read-only ("RO")
        access = self.get_config().get("access", None)

        # Added columns (dicts as returned by self.add_column)
        added_columns = []

        if access not in ["RO"]:

            # prefix used to name the exploded columns (defaults to "INFO/")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos: columns present in the table but not in the header
            # NOTE(review): bare except silently falls back to an empty list —
            # consider narrowing to the expected exception type
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (INFO field declarations from the VCF header)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET expression per field to explode
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column (prefix + INFO field name)
                info_id_sql = prefix + info

                # Only explode known fields (declared in header, requested with
                # prefix, or already present as extra columns)
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from the header, defaulting to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are always stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (dropped and re-created when force is set)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        # duckdb: extract the ';info=value' token with a regexp;
                        # empty or '.' values are stored as NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        # sqlite: no regexp available, parse with instr/substr
                        # NOTE(review): instr looks for '{info}=' anywhere in INFO,
                        # unanchored to ';' (unlike the duckdb branch) — a field
                        # name that is a suffix of another field name could match
                        # the wrong entry; confirm
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        # NOTE(review): if connexion_format is neither "duckdb" nor
                        # "sqlite", update_info_field is undefined (NameError) on
                        # the first iteration, or stale from a previous one —
                        # confirm supported formats are enforced upstream
                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (updates are chunked per chromosome)
                # NOTE(review): bare except falls back to a single full-table pass
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is only one chromosome,
                    # so the single pass covers the whole table)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE setting every field, or
                    # one UPDATE per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (rebuilds the indexes dropped at the top)
        if create_index:
            self.create_indexes()

        return added_columns
1847
1848    def create_indexes(self) -> None:
1849        """
1850        Create indexes on the table after insertion
1851        """
1852
1853        # Access
1854        access = self.get_config().get("access", None)
1855
1856        # get table variants
1857        table_variants = self.get_table_variants("FROM")
1858
1859        if self.get_indexing() and access not in ["RO"]:
1860            # Create index
1861            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1862            self.conn.execute(sql_create_table_index)
1863            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1864            self.conn.execute(sql_create_table_index)
1865            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1866            self.conn.execute(sql_create_table_index)
1867            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1868            self.conn.execute(sql_create_table_index)
1869            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1870            self.conn.execute(sql_create_table_index)
1871            for field in self.index_additionnal_fields:
1872                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1873                self.conn.execute(sql_create_table_index)
1874
1875    def drop_indexes(self) -> None:
1876        """
1877        Create indexes on the table after insertion
1878        """
1879
1880        # Access
1881        access = self.get_config().get("access", None)
1882
1883        # get table variants
1884        table_variants = self.get_table_variants("FROM")
1885
1886        # Get database format
1887        connexion_format = self.get_connexion_format()
1888
1889        if access not in ["RO"]:
1890            if connexion_format in ["duckdb"]:
1891                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1892            elif connexion_format in ["sqlite"]:
1893                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1894
1895            list_indexes = self.conn.execute(sql_list_indexes)
1896            index_names = [row[0] for row in list_indexes.fetchall()]
1897            for index in index_names:
1898                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1899                self.conn.execute(sql_drop_table_index)
1900
1901    def read_vcf_header(self, f) -> list:
1902        """
1903        It reads the header of a VCF file and returns a list of the header lines
1904
1905        :param f: the file object
1906        :return: The header lines of the VCF file.
1907        """
1908
1909        header_list = []
1910        for line in f:
1911            header_list.append(line)
1912            if line.startswith("#CHROM"):
1913                break
1914        return header_list
1915
1916    def read_vcf_header_file(self, file: str = None) -> list:
1917        """
1918        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1919        uncompressed files.
1920
1921        :param file: The `file` parameter is a string that represents the path to the VCF header file
1922        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1923        default to `None`
1924        :type file: str
1925        :return: The function `read_vcf_header_file` returns a list.
1926        """
1927
1928        if self.get_input_compressed(input_file=file):
1929            with bgzf.open(file, "rt") as f:
1930                return self.read_vcf_header(f=f)
1931        else:
1932            with open(file, "rt") as f:
1933                return self.read_vcf_header(f=f)
1934
1935    def execute_query(self, query: str):
1936        """
1937        It takes a query as an argument, executes it, and returns the results
1938
1939        :param query: The query to be executed
1940        :return: The result of the query is being returned.
1941        """
1942        if query:
1943            return self.conn.execute(query)  # .fetchall()
1944        else:
1945            return None
1946
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to a specified output file in various
        formats, including VCF, CSV, TSV, PSV, and Parquet.

        :param output_file: The `output_file` parameter is a string that specifies the name of the
        output file to be generated by the function. This is where the exported data will be saved
        :type output_file: str
        :param output_header: The `output_header` parameter is a string that specifies the name of the
        file where the header of the VCF file will be exported. If this parameter is not provided, the
        header will be exported to a file with the same name as the `output_file` parameter, but with
        the extension "
        :type output_header: str
        :param export_header: The `export_header` parameter is a boolean flag that determines whether
        the header of a VCF file should be exported to a separate file or not. If `export_header` is
        True, the header will be exported to a file. If `export_header` is False, the header will not
        be, defaults to True, if output format is not VCF
        :type export_header: bool (optional)
        :param query: The `query` parameter is an optional SQL query that can be used to filter and
        select specific data from the VCF file before exporting it. If provided, only the data that
        matches the query will be exported
        :type query: str
        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
        organize data in a hierarchical directory structure based on the values of one or more columns.
        This can improve query performance when working with large datasets
        :type parquet_partitions: list
        :param chunk_size: The `chunk_size` parameter specifies the number of
        records in batch when exporting data in Parquet format. This parameter is used for
        partitioning the Parquet file into multiple files.
        :type chunk_size: int
        :param threads: The `threads` parameter is an optional parameter that specifies the number of
        threads to be used during the export process. It determines the level of parallelism and can
        improve the performance of the export operation. If not provided, the function will use the
        default number of threads
        :type threads: int
        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
        False
        :type sort: bool (optional)
        :param index: The `index` parameter is a boolean flag that determines whether an index should be
        created on the output file. If `index` is True, an index will be created. If `index` is False,
        no index will be created. The default value is False, defaults to False
        :type index: bool (optional)
        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
        sorting the output file. This parameter is only applicable when exporting data in VCF format
        :type order_by: str
        :return: a boolean value. It checks if the output file exists and returns True if it does, or
        None if it doesn't.
        """

        # Log
        log.info("Exporting...")

        # Full path (resolve relative paths)
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove once the export is done
        tmp_to_remove = []

        # If no output, get it from the object
        if not output_file:
            output_file = self.get_output()

        # If not threads, use the object default
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension (defaults to "<output_file>.hdr")
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            # NOTE(review): output_header computed above is not passed here —
            # export_header derives the header path from output_file itself;
            # confirm both always resolve to the same path
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output (the header is embedded in the
        # VCF itself, so the side file is scheduled for removal)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size (from config if not given)
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (from param if not given; comma-separated string
        # accepted and split into a list)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by (from param if not given)
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output (whether to inline the header in the output file)
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database source: the live connexion by default (replaced below by a
        # Parquet dump for sqlite)
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos into columns before export, if requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        # sqlite cannot be exported directly: dump the variants table to a
        # temporary Parquet file and export from there
        if connexion_format in ["sqlite"]:

            # Export in Parquet with a randomized temporary name
            # NOTE(review): `string` is not imported explicitly in this module's
            # visible imports — presumably provided by a wildcard import
            # (howard.functions.commons); TODO confirm
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file (fastparquet)
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database object wrapping the source to export
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        # existing_columns_header = database.get_header_file_columns(output_header)
        existing_columns_header = database.get_header_columns_from_database()

        # Export file (delegates format handling, sorting, partitioning...)
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
        )

        # Remove temporary files
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are the same expression — this is
        # equivalent to `os.path.exists(output_file) or None` (True or None)
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )
2134
2135    def get_extra_infos(self, table: str = None) -> list:
2136        """
2137        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2138        in the header.
2139
2140        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2141        name of the table from which you want to retrieve the extra columns that are not present in the
2142        header. If the `table` parameter is not provided when calling the function, it will default to
2143        using the variants
2144        :type table: str
2145        :return: A list of columns that are in the specified table but not in the header of the table.
2146        """
2147
2148        header_columns = []
2149
2150        if not table:
2151            table = self.get_table_variants(clause="from")
2152            header_columns = self.get_header_columns()
2153
2154        # Check all columns in the database
2155        query = f""" SELECT * FROM {table} LIMIT 1 """
2156        log.debug(f"query {query}")
2157        table_columns = self.get_query_to_df(query).columns.tolist()
2158        extra_columns = []
2159
2160        # Construct extra infos (not in header)
2161        for column in table_columns:
2162            if column not in header_columns:
2163                extra_columns.append(column)
2164
2165        return extra_columns
2166
2167    def get_extra_infos_sql(self, table: str = None) -> str:
2168        """
2169        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2170        by double quotes
2171
2172        :param table: The name of the table to get the extra infos from. If None, the default table is
2173        used
2174        :type table: str
2175        :return: A string of the extra infos
2176        """
2177
2178        return ", ".join(
2179            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2180        )
2181
2182    def export_header(
2183        self,
2184        header_name: str = None,
2185        output_file: str = None,
2186        output_file_ext: str = ".hdr",
2187        clean_header: bool = True,
2188        remove_chrom_line: bool = False,
2189    ) -> str:
2190        """
2191        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2192        specified options, and writes it to a new file.
2193
2194        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2195        this parameter is not specified, the header will be written to the output file
2196        :type header_name: str
2197        :param output_file: The `output_file` parameter in the `export_header` function is used to
2198        specify the name of the output file where the header will be written. If this parameter is not
2199        provided, the header will be written to a temporary file
2200        :type output_file: str
2201        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2202        string that represents the extension of the output header file. By default, it is set to ".hdr"
2203        if not specified by the user. This extension will be appended to the `output_file` name to
2204        create the final, defaults to .hdr
2205        :type output_file_ext: str (optional)
2206        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2207        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2208        `True`, the function will clean the header by modifying certain lines based on a specific
2209        pattern. If `clean_header`, defaults to True
2210        :type clean_header: bool (optional)
2211        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2212        boolean flag that determines whether the #CHROM line should be removed from the header before
2213        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2214        defaults to False
2215        :type remove_chrom_line: bool (optional)
2216        :return: The function `export_header` returns the name of the temporary header file that is
2217        created.
2218        """
2219
2220        if not header_name and not output_file:
2221            output_file = self.get_output()
2222
2223        if self.get_header():
2224
2225            # Get header object
2226            header_obj = self.get_header()
2227
2228            # Create database
2229            db_for_header = Database(database=self.get_input())
2230
2231            # Get real columns in the file
2232            db_header_columns = db_for_header.get_columns()
2233
2234            with tempfile.TemporaryDirectory() as tmpdir:
2235
2236                # Write header file
2237                header_file_tmp = os.path.join(tmpdir, "header")
2238                f = open(header_file_tmp, "w")
2239                vcf.Writer(f, header_obj)
2240                f.close()
2241
2242                # Replace #CHROM line with rel columns
2243                header_list = db_for_header.read_header_file(
2244                    header_file=header_file_tmp
2245                )
2246                header_list[-1] = "\t".join(db_header_columns)
2247
2248                # Remove CHROM line
2249                if remove_chrom_line:
2250                    header_list.pop()
2251
2252                # Clean header
2253                if clean_header:
2254                    header_list_clean = []
2255                    for head in header_list:
2256                        # Clean head for malformed header
2257                        head_clean = head
2258                        head_clean = re.subn(
2259                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2260                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2261                            head_clean,
2262                            2,
2263                        )[0]
2264                        # Write header
2265                        header_list_clean.append(head_clean)
2266                    header_list = header_list_clean
2267
2268            tmp_header_name = output_file + output_file_ext
2269
2270            f = open(tmp_header_name, "w")
2271            for line in header_list:
2272                f.write(line)
2273            f.close()
2274
2275        return tmp_header_name
2276
2277    def export_variant_vcf(
2278        self,
2279        vcf_file,
2280        remove_info: bool = False,
2281        add_samples: bool = True,
2282        list_samples: list = [],
2283        where_clause: str = "",
2284        index: bool = False,
2285        threads: int | None = None,
2286    ) -> bool | None:
2287        """
2288        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2289        remove INFO field, add samples, and control compression and indexing.
2290
2291        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2292        written to. It is the output file that will contain the filtered VCF data based on the specified
2293        parameters
2294        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2295        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2296        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2297        in, defaults to False
2298        :type remove_info: bool (optional)
2299        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2300        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2301        If set to False, the samples will be removed. The default value is True, defaults to True
2302        :type add_samples: bool (optional)
2303        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2304        in the output VCF file. By default, all samples will be included. If you provide a list of
2305        samples, only those samples will be included in the output file
2306        :type list_samples: list
2307        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2308        determines whether or not to create an index for the output VCF file. If `index` is set to
2309        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2310        :type index: bool (optional)
2311        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2312        number of threads to use for exporting the VCF file. It determines how many parallel threads
2313        will be used during the export process. More threads can potentially speed up the export process
2314        by utilizing multiple cores of the processor. If
2315        :type threads: int | None
2316        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2317        method with various parameters including the output file, query, threads, sort flag, and index
2318        flag. The `export_output` method is responsible for exporting the VCF data based on the
2319        specified parameters and configurations provided in the `export_variant_vcf` function.
2320        """
2321
2322        # Config
2323        config = self.get_config()
2324
2325        # Extract VCF
2326        log.debug("Export VCF...")
2327
2328        # Table variants
2329        table_variants = self.get_table_variants()
2330
2331        # Threads
2332        if not threads:
2333            threads = self.get_threads()
2334
2335        # Info fields
2336        if remove_info:
2337            if not isinstance(remove_info, str):
2338                remove_info = "."
2339            info_field = f"""'{remove_info}' as INFO"""
2340        else:
2341            info_field = "INFO"
2342
2343        # Samples fields
2344        if add_samples:
2345            if not list_samples:
2346                list_samples = self.get_header_sample_list()
2347            if list_samples:
2348                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2349            else:
2350                samples_fields = ""
2351            log.debug(f"samples_fields: {samples_fields}")
2352        else:
2353            samples_fields = ""
2354
2355        # Where clause
2356        if where_clause is None:
2357            where_clause = ""
2358
2359        # Variants
2360        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2361        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2362        log.debug(f"sql_query_select={sql_query_select}")
2363
2364        return self.export_output(
2365            output_file=vcf_file,
2366            output_header=None,
2367            export_header=True,
2368            query=sql_query_select,
2369            parquet_partitions=None,
2370            chunk_size=config.get("chunk_size", None),
2371            threads=threads,
2372            sort=True,
2373            index=index,
2374            order_by=None,
2375        )
2376
2377    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2378        """
2379        It takes a list of commands and runs them in parallel using the number of threads specified
2380
2381        :param commands: A list of commands to run
2382        :param threads: The number of threads to use, defaults to 1 (optional)
2383        """
2384
2385        run_parallel_commands(commands, threads)
2386
2387    def get_threads(self, default: int = 1) -> int:
2388        """
2389        This function returns the number of threads to use for a job, with a default value of 1 if not
2390        specified.
2391
2392        :param default: The `default` parameter in the `get_threads` method is used to specify the
2393        default number of threads to use if no specific value is provided. If no value is provided for
2394        the `threads` parameter in the configuration or input parameters, the `default` value will be
2395        used, defaults to 1
2396        :type default: int (optional)
2397        :return: the number of threads to use for the current job.
2398        """
2399
2400        # Config
2401        config = self.get_config()
2402
2403        # Param
2404        param = self.get_param()
2405
2406        # Input threads
2407        input_thread = param.get("threads", config.get("threads", None))
2408
2409        # Check threads
2410        if not input_thread:
2411            threads = default
2412        elif int(input_thread) <= 0:
2413            threads = os.cpu_count()
2414        else:
2415            threads = int(input_thread)
2416        return threads
2417
2418    def get_memory(self, default: str = None) -> str:
2419        """
2420        This function retrieves the memory value from parameters or configuration with a default value
2421        if not found.
2422
2423        :param default: The `get_memory` function takes in a default value as a string parameter. This
2424        default value is used as a fallback in case the `memory` parameter is not provided in the
2425        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2426        the function
2427        :type default: str
2428        :return: The `get_memory` function returns a string value representing the memory parameter. If
2429        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2430        return the default value provided as an argument to the function.
2431        """
2432
2433        # Config
2434        config = self.get_config()
2435
2436        # Param
2437        param = self.get_param()
2438
2439        # Input threads
2440        input_memory = param.get("memory", config.get("memory", None))
2441
2442        # Check threads
2443        if input_memory:
2444            memory = input_memory
2445        else:
2446            memory = default
2447
2448        return memory
2449
2450    def update_from_vcf(self, vcf_file: str) -> None:
2451        """
2452        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2453
2454        :param vcf_file: the path to the VCF file
2455        """
2456
2457        connexion_format = self.get_connexion_format()
2458
2459        if connexion_format in ["duckdb"]:
2460            self.update_from_vcf_duckdb(vcf_file)
2461        elif connexion_format in ["sqlite"]:
2462            self.update_from_vcf_sqlite(vcf_file)
2463
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB
        backend).

        The VCF body is loaded into a pandas DataFrame and queried directly
        from SQL through DuckDB's replacement scan: the table name `vcf_df`
        in the UPDATE statement resolves to the local DataFrame variable. For
        each variant matching on #CHROM/POS/REF/ALT, the VCF INFO value is
        appended to the existing INFO, separated by ';' when both sides are
        non-empty ('' and '.' are treated as empty INFO values).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame.
        # Assumes get_header_length() counts the '##' meta lines, so the next
        # line (#CHROM ...) is parsed as the column header — TODO confirm.
        # NOTE: the variable name `vcf_df` matters — it is referenced by name
        # in the SQL below via DuckDB's replacement scan; do not rename it.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO; the nested CASE
        # expressions insert the ';' separator only when both are non-empty.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2519
2520    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2521        """
2522        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2523        table, then updates the INFO column of the variants table with the INFO column of the temporary
2524        table
2525
2526        :param vcf_file: The path to the VCF file you want to update the database with
2527        """
2528
2529        # Create a temporary table for the VCF
2530        table_vcf = "tmp_vcf"
2531        sql_create = (
2532            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2533        )
2534        self.conn.execute(sql_create)
2535
2536        # Loading VCF into temporaire table
2537        vcf_df = pd.read_csv(
2538            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2539        )
2540        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2541        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2542
2543        # Update table 'variants' with VCF data
2544        # warning: CONCAT as || operator
2545        sql_query_update = f"""
2546            UPDATE variants as table_variants
2547            SET INFO = CASE
2548                            WHEN INFO NOT IN ('', '.')
2549                            THEN INFO
2550                            ELSE ''
2551                        END ||
2552                        (
2553                        SELECT 
2554                            CASE 
2555                                WHEN table_variants.INFO NOT IN ('','.') 
2556                                    AND table_vcf.INFO NOT IN ('','.')  
2557                                THEN ';' 
2558                                ELSE '' 
2559                            END || 
2560                            CASE 
2561                                WHEN table_vcf.INFO NOT IN ('','.') 
2562                                THEN table_vcf.INFO 
2563                                ELSE '' 
2564                            END
2565                        FROM {table_vcf} as table_vcf
2566                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2567                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2568                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2569                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2570                        )
2571        """
2572        self.conn.execute(sql_query_update)
2573
2574        # Drop temporary table
2575        sql_drop = f"DROP TABLE {table_vcf}"
2576        self.conn.execute(sql_drop)
2577
2578    def drop_variants_table(self) -> None:
2579        """
2580        > This function drops the variants table
2581        """
2582
2583        table_variants = self.get_table_variants()
2584        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2585        self.conn.execute(sql_table_variants)
2586
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool | None = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table, populated with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` values
        (DuckDB `hash()` function, `UBIGINT` column).

        :param variant_id_column: The name of the column to create in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)computed even when
        "variant_id" is already listed in the extra infos
        :type force: bool | None
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create the variant_id column only if missing, unless forced
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument is the quoted string
            # literal '"{prefix}SVTYPE"', not the exploded SVTYPE column
            # value — so SVTYPE does not actually vary the hash. This looks
            # unintended (the column is exploded above just for this), but
            # changing it would change all existing hashes; confirm before
            # fixing.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporarily exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2645
2646    def get_variant_id_column(
2647        self, variant_id_column: str = "variant_id", force: bool = None
2648    ) -> str:
2649        """
2650        This function returns the variant_id column name
2651
2652        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2653        defaults to variant_id
2654        :type variant_id_column: str (optional)
2655        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2656        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2657        if it is not already set, or if it is set
2658        :type force: bool
2659        :return: The variant_id column name.
2660        """
2661
2662        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2663
2664    ###
2665    # Annotation
2666    ###
2667
2668    def scan_databases(
2669        self,
2670        database_formats: list = ["parquet"],
2671        database_releases: list = ["current"],
2672    ) -> dict:
2673        """
2674        The function `scan_databases` scans for available databases based on specified formats and
2675        releases.
2676
2677        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2678        of the databases to be scanned. In this case, the accepted format is "parquet"
2679        :type database_formats: list ["parquet"]
2680        :param database_releases: The `database_releases` parameter is a list that specifies the
2681        releases of the databases to be scanned. In the provided function, the default value for
2682        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2683        databases that are in the "current"
2684        :type database_releases: list
2685        :return: The function `scan_databases` returns a dictionary containing information about
2686        databases that match the specified formats and releases.
2687        """
2688
2689        # Config
2690        config = self.get_config()
2691
2692        # Param
2693        param = self.get_param()
2694
2695        # Param - Assembly
2696        assembly = param.get("assembly", config.get("assembly", None))
2697        if not assembly:
2698            assembly = DEFAULT_ASSEMBLY
2699            log.warning(f"Default assembly '{assembly}'")
2700
2701        # Scan for availabled databases
2702        log.info(
2703            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2704        )
2705        databases_infos_dict = databases_infos(
2706            database_folder_releases=database_releases,
2707            database_formats=database_formats,
2708            assembly=assembly,
2709            config=config,
2710        )
2711        log.info(
2712            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2713        )
2714
2715        return databases_infos_dict
2716
2717    def annotation(self) -> None:
2718        """
2719        It annotates the VCF file with the annotations specified in the config file.
2720        """
2721
2722        # Config
2723        config = self.get_config()
2724
2725        # Param
2726        param = self.get_param()
2727
2728        # Param - Assembly
2729        assembly = param.get("assembly", config.get("assembly", None))
2730        if not assembly:
2731            assembly = DEFAULT_ASSEMBLY
2732            log.warning(f"Default assembly '{assembly}'")
2733
2734        # annotations databases folders
2735        annotations_databases = set(
2736            config.get("folders", {})
2737            .get("databases", {})
2738            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2739            + config.get("folders", {})
2740            .get("databases", {})
2741            .get("parquet", ["~/howard/databases/parquet/current"])
2742            + config.get("folders", {})
2743            .get("databases", {})
2744            .get("bcftools", ["~/howard/databases/bcftools/current"])
2745        )
2746
2747        # Get param annotations
2748        if param.get("annotations", None) and isinstance(
2749            param.get("annotations", None), str
2750        ):
2751            log.debug(param.get("annotations", None))
2752            param_annotation_list = param.get("annotations").split(",")
2753        else:
2754            param_annotation_list = []
2755
2756        # Each tools param
2757        if param.get("annotation_parquet", None) != None:
2758            log.debug(
2759                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2760            )
2761            if isinstance(param.get("annotation_parquet", None), list):
2762                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2763            else:
2764                param_annotation_list.append(param.get("annotation_parquet"))
2765        if param.get("annotation_snpsift", None) != None:
2766            if isinstance(param.get("annotation_snpsift", None), list):
2767                param_annotation_list.append(
2768                    "snpsift:"
2769                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2770                )
2771            else:
2772                param_annotation_list.append(
2773                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2774                )
2775        if param.get("annotation_snpeff", None) != None:
2776            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2777        if param.get("annotation_bcftools", None) != None:
2778            if isinstance(param.get("annotation_bcftools", None), list):
2779                param_annotation_list.append(
2780                    "bcftools:"
2781                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2782                )
2783            else:
2784                param_annotation_list.append(
2785                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2786                )
2787        if param.get("annotation_annovar", None) != None:
2788            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2789        if param.get("annotation_exomiser", None) != None:
2790            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2791        if param.get("annotation_splice", None) != None:
2792            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2793
2794        # Merge param annotations list
2795        param["annotations"] = ",".join(param_annotation_list)
2796
2797        # debug
2798        log.debug(f"param_annotations={param['annotations']}")
2799
2800        if param.get("annotations"):
2801
2802            # Log
2803            # log.info("Annotations - Check annotation parameters")
2804
2805            if not "annotation" in param:
2806                param["annotation"] = {}
2807
2808            # List of annotations parameters
2809            annotations_list_input = {}
2810            if isinstance(param.get("annotations", None), str):
2811                annotation_file_list = [
2812                    value for value in param.get("annotations", "").split(",")
2813                ]
2814                for annotation_file in annotation_file_list:
2815                    annotations_list_input[annotation_file] = {"INFO": None}
2816            else:
2817                annotations_list_input = param.get("annotations", {})
2818
2819            log.info(f"Quick Annotations:")
2820            for annotation_key in list(annotations_list_input.keys()):
2821                log.info(f"   {annotation_key}")
2822
2823            # List of annotations and associated fields
2824            annotations_list = {}
2825
2826            for annotation_file in annotations_list_input:
2827
2828                # Explode annotations if ALL
2829                if (
2830                    annotation_file.upper() == "ALL"
2831                    or annotation_file.upper().startswith("ALL:")
2832                ):
2833
2834                    # check ALL parameters (formats, releases)
2835                    annotation_file_split = annotation_file.split(":")
2836                    database_formats = "parquet"
2837                    database_releases = "current"
2838                    for annotation_file_option in annotation_file_split[1:]:
2839                        database_all_options_split = annotation_file_option.split("=")
2840                        if database_all_options_split[0] == "format":
2841                            database_formats = database_all_options_split[1].split("+")
2842                        if database_all_options_split[0] == "release":
2843                            database_releases = database_all_options_split[1].split("+")
2844
2845                    # Scan for availabled databases
2846                    databases_infos_dict = self.scan_databases(
2847                        database_formats=database_formats,
2848                        database_releases=database_releases,
2849                    )
2850
2851                    # Add found databases in annotation parameters
2852                    for database_infos in databases_infos_dict.keys():
2853                        annotations_list[database_infos] = {"INFO": None}
2854
2855                else:
2856                    annotations_list[annotation_file] = annotations_list_input[
2857                        annotation_file
2858                    ]
2859
2860            # Check each databases
2861            if len(annotations_list):
2862
2863                log.info(
2864                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2865                )
2866
2867                for annotation_file in annotations_list:
2868
2869                    # Init
2870                    annotations = annotations_list.get(annotation_file, None)
2871
2872                    # Annotation snpEff
2873                    if annotation_file.startswith("snpeff"):
2874
2875                        log.debug(f"Quick Annotation snpEff")
2876
2877                        if "snpeff" not in param["annotation"]:
2878                            param["annotation"]["snpeff"] = {}
2879
2880                        if "options" not in param["annotation"]["snpeff"]:
2881                            param["annotation"]["snpeff"]["options"] = ""
2882
2883                        # snpEff options in annotations
2884                        param["annotation"]["snpeff"]["options"] = "".join(
2885                            annotation_file.split(":")[1:]
2886                        )
2887
2888                    # Annotation Annovar
2889                    elif annotation_file.startswith("annovar"):
2890
2891                        log.debug(f"Quick Annotation Annovar")
2892
2893                        if "annovar" not in param["annotation"]:
2894                            param["annotation"]["annovar"] = {}
2895
2896                        if "annotations" not in param["annotation"]["annovar"]:
2897                            param["annotation"]["annovar"]["annotations"] = {}
2898
2899                        # Options
2900                        annotation_file_split = annotation_file.split(":")
2901                        for annotation_file_annotation in annotation_file_split[1:]:
2902                            if annotation_file_annotation:
2903                                param["annotation"]["annovar"]["annotations"][
2904                                    annotation_file_annotation
2905                                ] = annotations
2906
2907                    # Annotation Exomiser
2908                    elif annotation_file.startswith("exomiser"):
2909
2910                        log.debug(f"Quick Annotation Exomiser")
2911
2912                        param["annotation"]["exomiser"] = params_string_to_dict(
2913                            annotation_file
2914                        )
2915
2916                    # Annotation Splice
2917                    elif annotation_file.startswith("splice"):
2918
2919                        log.debug(f"Quick Annotation Splice")
2920
2921                        param["annotation"]["splice"] = params_string_to_dict(
2922                            annotation_file
2923                        )
2924
2925                    # Annotation Parquet or BCFTOOLS
2926                    else:
2927
2928                        # Tools detection
2929                        if annotation_file.startswith("bcftools:"):
2930                            annotation_tool_initial = "bcftools"
2931                            annotation_file = ":".join(annotation_file.split(":")[1:])
2932                        elif annotation_file.startswith("snpsift:"):
2933                            annotation_tool_initial = "snpsift"
2934                            annotation_file = ":".join(annotation_file.split(":")[1:])
2935                        else:
2936                            annotation_tool_initial = None
2937
2938                        # list of files
2939                        annotation_file_list = annotation_file.replace("+", ":").split(
2940                            ":"
2941                        )
2942
2943                        for annotation_file in annotation_file_list:
2944
2945                            if annotation_file:
2946
2947                                # Annotation tool initial
2948                                annotation_tool = annotation_tool_initial
2949
2950                                # Find file
2951                                annotation_file_found = None
2952
2953                                # Expand user
2954                                annotation_file = full_path(annotation_file)
2955
2956                                if os.path.exists(annotation_file):
2957                                    annotation_file_found = annotation_file
2958
2959                                else:
2960                                    # Find within assembly folders
2961                                    for annotations_database in annotations_databases:
2962                                        found_files = find_all(
2963                                            annotation_file,
2964                                            os.path.join(
2965                                                annotations_database, assembly
2966                                            ),
2967                                        )
2968                                        if len(found_files) > 0:
2969                                            annotation_file_found = found_files[0]
2970                                            break
2971                                    if not annotation_file_found and not assembly:
2972                                        # Find within folders
2973                                        for (
2974                                            annotations_database
2975                                        ) in annotations_databases:
2976                                            found_files = find_all(
2977                                                annotation_file, annotations_database
2978                                            )
2979                                            if len(found_files) > 0:
2980                                                annotation_file_found = found_files[0]
2981                                                break
2982                                log.debug(
2983                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2984                                )
2985
2986                                # Full path
2987                                annotation_file_found = full_path(annotation_file_found)
2988
2989                                if annotation_file_found:
2990
2991                                    database = Database(database=annotation_file_found)
2992                                    quick_annotation_format = database.get_format()
2993                                    quick_annotation_is_compressed = (
2994                                        database.is_compressed()
2995                                    )
2996                                    quick_annotation_is_indexed = os.path.exists(
2997                                        f"{annotation_file_found}.tbi"
2998                                    )
2999                                    bcftools_preference = False
3000
3001                                    # Check Annotation Tool
3002                                    if not annotation_tool:
3003                                        if (
3004                                            bcftools_preference
3005                                            and quick_annotation_format
3006                                            in ["vcf", "bed"]
3007                                            and quick_annotation_is_compressed
3008                                            and quick_annotation_is_indexed
3009                                        ):
3010                                            annotation_tool = "bcftools"
3011                                        elif quick_annotation_format in [
3012                                            "vcf",
3013                                            "bed",
3014                                            "tsv",
3015                                            "tsv",
3016                                            "csv",
3017                                            "json",
3018                                            "tbl",
3019                                            "parquet",
3020                                            "duckdb",
3021                                        ]:
3022                                            annotation_tool = "parquet"
3023                                        else:
3024                                            log.error(
3025                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3026                                            )
3027                                            raise ValueError(
3028                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3029                                            )
3030
3031                                    log.debug(
3032                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3033                                    )
3034
3035                                    # Annotation Tool dispatch
3036                                    if annotation_tool:
3037                                        if annotation_tool not in param["annotation"]:
3038                                            param["annotation"][annotation_tool] = {}
3039                                        if (
3040                                            "annotations"
3041                                            not in param["annotation"][annotation_tool]
3042                                        ):
3043                                            param["annotation"][annotation_tool][
3044                                                "annotations"
3045                                            ] = {}
3046                                        param["annotation"][annotation_tool][
3047                                            "annotations"
3048                                        ][annotation_file_found] = annotations
3049
3050                                else:
3051                                    log.error(
3052                                        f"Quick Annotation File {annotation_file} does NOT exist"
3053                                    )
3054
3055                self.set_param(param)
3056
3057        if param.get("annotation", None):
3058            log.info("Annotations")
3059            if param.get("annotation", {}).get("parquet", None):
3060                log.info("Annotations 'parquet'...")
3061                self.annotation_parquet()
3062            if param.get("annotation", {}).get("bcftools", None):
3063                log.info("Annotations 'bcftools'...")
3064                self.annotation_bcftools()
3065            if param.get("annotation", {}).get("snpsift", None):
3066                log.info("Annotations 'snpsift'...")
3067                self.annotation_snpsift()
3068            if param.get("annotation", {}).get("annovar", None):
3069                log.info("Annotations 'annovar'...")
3070                self.annotation_annovar()
3071            if param.get("annotation", {}).get("snpeff", None):
3072                log.info("Annotations 'snpeff'...")
3073                self.annotation_snpeff()
3074            if param.get("annotation", {}).get("exomiser", None) is not None:
3075                log.info("Annotations 'exomiser'...")
3076                self.annotation_exomiser()
3077            if param.get("annotation", {}).get("splice", None) is not None:
3078                log.info("Annotations 'splice' ...")
3079                self.annotation_splice()
3080
3081        # Explode INFOS fields into table fields
3082        if self.get_explode_infos():
3083            self.explode_infos(
3084                prefix=self.get_explode_infos_prefix(),
3085                fields=self.get_explode_infos_fields(),
3086                force=True,
3087            )
3088
3089    def annotation_snpsift(self, threads: int = None) -> None:
3090        """
3091        This function annotate with bcftools
3092
3093        :param threads: Number of threads to use
3094        :return: the value of the variable "return_value".
3095        """
3096
3097        # DEBUG
3098        log.debug("Start annotation with bcftools databases")
3099
3100        # Threads
3101        if not threads:
3102            threads = self.get_threads()
3103        log.debug("Threads: " + str(threads))
3104
3105        # Config
3106        config = self.get_config()
3107        log.debug("Config: " + str(config))
3108
3109        # Config - snpSift
3110        snpsift_bin_command = get_bin_command(
3111            bin="SnpSift.jar",
3112            tool="snpsift",
3113            bin_type="jar",
3114            config=config,
3115            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3116        )
3117        if not snpsift_bin_command:
3118            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3119            log.error(msg_err)
3120            raise ValueError(msg_err)
3121
3122        # Config - bcftools
3123        bcftools_bin_command = get_bin_command(
3124            bin="bcftools",
3125            tool="bcftools",
3126            bin_type="bin",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3129        )
3130        if not bcftools_bin_command:
3131            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - BCFTools databases folders
3136        databases_folders = set(
3137            self.get_config()
3138            .get("folders", {})
3139            .get("databases", {})
3140            .get("annotations", ["."])
3141            + self.get_config()
3142            .get("folders", {})
3143            .get("databases", {})
3144            .get("bcftools", ["."])
3145        )
3146        log.debug("Databases annotations: " + str(databases_folders))
3147
3148        # Param
3149        annotations = (
3150            self.get_param()
3151            .get("annotation", {})
3152            .get("snpsift", {})
3153            .get("annotations", None)
3154        )
3155        log.debug("Annotations: " + str(annotations))
3156
3157        # Assembly
3158        assembly = self.get_param().get(
3159            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3160        )
3161
3162        # Data
3163        table_variants = self.get_table_variants()
3164
3165        # Check if not empty
3166        log.debug("Check if not empty")
3167        sql_query_chromosomes = (
3168            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3169        )
3170        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3171        if not sql_query_chromosomes_df["count"][0]:
3172            log.info(f"VCF empty")
3173            return
3174
3175        # VCF header
3176        vcf_reader = self.get_header()
3177        log.debug("Initial header: " + str(vcf_reader.infos))
3178
3179        # Existing annotations
3180        for vcf_annotation in self.get_header().infos:
3181
3182            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3183            log.debug(
3184                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3185            )
3186
3187        if annotations:
3188
3189            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3190
3191                # Export VCF file
3192                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3193
3194                # Init
3195                commands = {}
3196
3197                for annotation in annotations:
3198                    annotation_fields = annotations[annotation]
3199
3200                    # Annotation Name
3201                    annotation_name = os.path.basename(annotation)
3202
3203                    if not annotation_fields:
3204                        annotation_fields = {"INFO": None}
3205
3206                    log.debug(f"Annotation '{annotation_name}'")
3207                    log.debug(
3208                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3209                    )
3210
3211                    # Create Database
3212                    database = Database(
3213                        database=annotation,
3214                        databases_folders=databases_folders,
3215                        assembly=assembly,
3216                    )
3217
3218                    # Find files
3219                    db_file = database.get_database()
3220                    db_file = full_path(db_file)
3221                    db_hdr_file = database.get_header_file()
3222                    db_hdr_file = full_path(db_hdr_file)
3223                    db_file_type = database.get_format()
3224                    db_tbi_file = f"{db_file}.tbi"
3225                    db_file_compressed = database.is_compressed()
3226
3227                    # Check if compressed
3228                    if not db_file_compressed:
3229                        log.error(
3230                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3231                        )
3232                        raise ValueError(
3233                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3234                        )
3235
3236                    # Check if indexed
3237                    if not os.path.exists(db_tbi_file):
3238                        log.error(
3239                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3240                        )
3241                        raise ValueError(
3242                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3243                        )
3244
3245                    # Check index - try to create if not exists
3246                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3247                        log.error("Annotation failed: database not valid")
3248                        log.error(f"Annotation annotation file: {db_file}")
3249                        log.error(f"Annotation annotation header: {db_hdr_file}")
3250                        log.error(f"Annotation annotation index: {db_tbi_file}")
3251                        raise ValueError(
3252                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3253                        )
3254                    else:
3255
3256                        log.debug(
3257                            f"Annotation '{annotation}' - file: "
3258                            + str(db_file)
3259                            + " and "
3260                            + str(db_hdr_file)
3261                        )
3262
3263                        # Load header as VCF object
3264                        db_hdr_vcf = Variants(input=db_hdr_file)
3265                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3266                        log.debug(
3267                            "Annotation database header: "
3268                            + str(db_hdr_vcf_header_infos)
3269                        )
3270
3271                        # For all fields in database
3272                        annotation_fields_full = False
3273                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3274                            annotation_fields = {
3275                                key: key for key in db_hdr_vcf_header_infos
3276                            }
3277                            log.debug(
3278                                "Annotation database header - All annotations added: "
3279                                + str(annotation_fields)
3280                            )
3281                            annotation_fields_full = True
3282
3283                        # # Create file for field rename
3284                        # log.debug("Create file for field rename")
3285                        # tmp_rename = NamedTemporaryFile(
3286                        #     prefix=self.get_prefix(),
3287                        #     dir=self.get_tmp_dir(),
3288                        #     suffix=".rename",
3289                        #     delete=False,
3290                        # )
3291                        # tmp_rename_name = tmp_rename.name
3292                        # tmp_files.append(tmp_rename_name)
3293
3294                        # Number of fields
3295                        nb_annotation_field = 0
3296                        annotation_list = []
3297                        annotation_infos_rename_list = []
3298
3299                        for annotation_field in annotation_fields:
3300
3301                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3302                            annotation_fields_new_name = annotation_fields.get(
3303                                annotation_field, annotation_field
3304                            )
3305                            if not annotation_fields_new_name:
3306                                annotation_fields_new_name = annotation_field
3307
3308                            # Check if field is in DB and if field is not elready in input data
3309                            if (
3310                                annotation_field in db_hdr_vcf.get_header().infos
3311                                and annotation_fields_new_name
3312                                not in self.get_header().infos
3313                            ):
3314
3315                                log.info(
3316                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3317                                )
3318
3319                                # BCFTools annotate param to rename fields
3320                                if annotation_field != annotation_fields_new_name:
3321                                    annotation_infos_rename_list.append(
3322                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3323                                    )
3324
3325                                # Add INFO field to header
3326                                db_hdr_vcf_header_infos_number = (
3327                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3328                                )
3329                                db_hdr_vcf_header_infos_type = (
3330                                    db_hdr_vcf_header_infos[annotation_field].type
3331                                    or "String"
3332                                )
3333                                db_hdr_vcf_header_infos_description = (
3334                                    db_hdr_vcf_header_infos[annotation_field].desc
3335                                    or f"{annotation_field} description"
3336                                )
3337                                db_hdr_vcf_header_infos_source = (
3338                                    db_hdr_vcf_header_infos[annotation_field].source
3339                                    or "unknown"
3340                                )
3341                                db_hdr_vcf_header_infos_version = (
3342                                    db_hdr_vcf_header_infos[annotation_field].version
3343                                    or "unknown"
3344                                )
3345
3346                                vcf_reader.infos[annotation_fields_new_name] = (
3347                                    vcf.parser._Info(
3348                                        annotation_fields_new_name,
3349                                        db_hdr_vcf_header_infos_number,
3350                                        db_hdr_vcf_header_infos_type,
3351                                        db_hdr_vcf_header_infos_description,
3352                                        db_hdr_vcf_header_infos_source,
3353                                        db_hdr_vcf_header_infos_version,
3354                                        self.code_type_map[
3355                                            db_hdr_vcf_header_infos_type
3356                                        ],
3357                                    )
3358                                )
3359
3360                                annotation_list.append(annotation_field)
3361
3362                                nb_annotation_field += 1
3363
3364                            else:
3365
3366                                if (
3367                                    annotation_field
3368                                    not in db_hdr_vcf.get_header().infos
3369                                ):
3370                                    log.warning(
3371                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3372                                    )
3373                                if (
3374                                    annotation_fields_new_name
3375                                    in self.get_header().infos
3376                                ):
3377                                    log.warning(
3378                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3379                                    )
3380
3381                        log.info(
3382                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3383                        )
3384
3385                        annotation_infos = ",".join(annotation_list)
3386
3387                        if annotation_infos != "":
3388
3389                            # Annotated VCF (and error file)
3390                            tmp_annotation_vcf_name = os.path.join(
3391                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3392                            )
3393                            tmp_annotation_vcf_name_err = (
3394                                tmp_annotation_vcf_name + ".err"
3395                            )
3396
3397                            # Add fields to annotate
3398                            if not annotation_fields_full:
3399                                annotation_infos_option = f"-info {annotation_infos}"
3400                            else:
3401                                annotation_infos_option = ""
3402
3403                            # Info fields rename
3404                            if annotation_infos_rename_list:
3405                                annotation_infos_rename = " -c " + ",".join(
3406                                    annotation_infos_rename_list
3407                                )
3408                            else:
3409                                annotation_infos_rename = ""
3410
3411                            # Annotate command
3412                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3413
3414                            # Add command
3415                            commands[command_annotate] = tmp_annotation_vcf_name
3416
3417                if commands:
3418
3419                    # Export VCF file
3420                    self.export_variant_vcf(
3421                        vcf_file=tmp_vcf_name,
3422                        remove_info=True,
3423                        add_samples=False,
3424                        index=True,
3425                    )
3426                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3427
3428                    # Num command
3429                    nb_command = 0
3430
3431                    # Annotate
3432                    for command_annotate in commands:
3433                        nb_command += 1
3434                        log.info(
3435                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3436                        )
3437                        log.debug(f"command_annotate={command_annotate}")
3438                        run_parallel_commands([command_annotate], threads)
3439
3440                        # Debug
3441                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3442
3443                        # Update variants
3444                        log.info(
3445                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3446                        )
3447                        self.update_from_vcf(commands[command_annotate])
3448
3449    def annotation_bcftools(self, threads: int = None) -> None:
3450        """
3451        This function annotate with bcftools
3452
3453        :param threads: Number of threads to use
3454        :return: the value of the variable "return_value".
3455        """
3456
3457        # DEBUG
3458        log.debug("Start annotation with bcftools databases")
3459
3460        # Threads
3461        if not threads:
3462            threads = self.get_threads()
3463        log.debug("Threads: " + str(threads))
3464
3465        # Config
3466        config = self.get_config()
3467        log.debug("Config: " + str(config))
3468
3469        # DEBUG
3470        delete_tmp = True
3471        if self.get_config().get("verbosity", "warning") in ["debug"]:
3472            delete_tmp = False
3473            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3474
3475        # Config - BCFTools bin command
3476        bcftools_bin_command = get_bin_command(
3477            bin="bcftools",
3478            tool="bcftools",
3479            bin_type="bin",
3480            config=config,
3481            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3482        )
3483        if not bcftools_bin_command:
3484            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3485            log.error(msg_err)
3486            raise ValueError(msg_err)
3487
3488        # Config - BCFTools databases folders
3489        databases_folders = set(
3490            self.get_config()
3491            .get("folders", {})
3492            .get("databases", {})
3493            .get("annotations", ["."])
3494            + self.get_config()
3495            .get("folders", {})
3496            .get("databases", {})
3497            .get("bcftools", ["."])
3498        )
3499        log.debug("Databases annotations: " + str(databases_folders))
3500
3501        # Param
3502        annotations = (
3503            self.get_param()
3504            .get("annotation", {})
3505            .get("bcftools", {})
3506            .get("annotations", None)
3507        )
3508        log.debug("Annotations: " + str(annotations))
3509
3510        # Assembly
3511        assembly = self.get_param().get(
3512            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3513        )
3514
3515        # Data
3516        table_variants = self.get_table_variants()
3517
3518        # Check if not empty
3519        log.debug("Check if not empty")
3520        sql_query_chromosomes = (
3521            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3522        )
3523        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3524        if not sql_query_chromosomes_df["count"][0]:
3525            log.info(f"VCF empty")
3526            return
3527
3528        # Export in VCF
3529        log.debug("Create initial file to annotate")
3530        tmp_vcf = NamedTemporaryFile(
3531            prefix=self.get_prefix(),
3532            dir=self.get_tmp_dir(),
3533            suffix=".vcf.gz",
3534            delete=False,
3535        )
3536        tmp_vcf_name = tmp_vcf.name
3537
3538        # VCF header
3539        vcf_reader = self.get_header()
3540        log.debug("Initial header: " + str(vcf_reader.infos))
3541
3542        # Existing annotations
3543        for vcf_annotation in self.get_header().infos:
3544
3545            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3546            log.debug(
3547                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3548            )
3549
3550        if annotations:
3551
3552            tmp_ann_vcf_list = []
3553            commands = []
3554            tmp_files = []
3555            err_files = []
3556
3557            for annotation in annotations:
3558                annotation_fields = annotations[annotation]
3559
3560                # Annotation Name
3561                annotation_name = os.path.basename(annotation)
3562
3563                if not annotation_fields:
3564                    annotation_fields = {"INFO": None}
3565
3566                log.debug(f"Annotation '{annotation_name}'")
3567                log.debug(
3568                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3569                )
3570
3571                # Create Database
3572                database = Database(
3573                    database=annotation,
3574                    databases_folders=databases_folders,
3575                    assembly=assembly,
3576                )
3577
3578                # Find files
3579                db_file = database.get_database()
3580                db_file = full_path(db_file)
3581                db_hdr_file = database.get_header_file()
3582                db_hdr_file = full_path(db_hdr_file)
3583                db_file_type = database.get_format()
3584                db_tbi_file = f"{db_file}.tbi"
3585                db_file_compressed = database.is_compressed()
3586
3587                # Check if compressed
3588                if not db_file_compressed:
3589                    log.error(
3590                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3591                    )
3592                    raise ValueError(
3593                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3594                    )
3595
3596                # Check if indexed
3597                if not os.path.exists(db_tbi_file):
3598                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3599                    raise ValueError(
3600                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3601                    )
3602
3603                # Check index - try to create if not exists
3604                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3605                    log.error("Annotation failed: database not valid")
3606                    log.error(f"Annotation annotation file: {db_file}")
3607                    log.error(f"Annotation annotation header: {db_hdr_file}")
3608                    log.error(f"Annotation annotation index: {db_tbi_file}")
3609                    raise ValueError(
3610                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3611                    )
3612                else:
3613
3614                    log.debug(
3615                        f"Annotation '{annotation}' - file: "
3616                        + str(db_file)
3617                        + " and "
3618                        + str(db_hdr_file)
3619                    )
3620
3621                    # Load header as VCF object
3622                    db_hdr_vcf = Variants(input=db_hdr_file)
3623                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3624                    log.debug(
3625                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3626                    )
3627
3628                    # For all fields in database
3629                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3630                        annotation_fields = {
3631                            key: key for key in db_hdr_vcf_header_infos
3632                        }
3633                        log.debug(
3634                            "Annotation database header - All annotations added: "
3635                            + str(annotation_fields)
3636                        )
3637
3638                    # Number of fields
3639                    nb_annotation_field = 0
3640                    annotation_list = []
3641
3642                    for annotation_field in annotation_fields:
3643
3644                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3645                        annotation_fields_new_name = annotation_fields.get(
3646                            annotation_field, annotation_field
3647                        )
3648                        if not annotation_fields_new_name:
3649                            annotation_fields_new_name = annotation_field
3650
3651                        # Check if field is in DB and if field is not already in input data
3652                        if (
3653                            annotation_field in db_hdr_vcf.get_header().infos
3654                            and annotation_fields_new_name
3655                            not in self.get_header().infos
3656                        ):
3657
3658                            log.info(
3659                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3660                            )
3661
3662                            # Add INFO field to header
3663                            db_hdr_vcf_header_infos_number = (
3664                                db_hdr_vcf_header_infos[annotation_field].num or "."
3665                            )
3666                            db_hdr_vcf_header_infos_type = (
3667                                db_hdr_vcf_header_infos[annotation_field].type
3668                                or "String"
3669                            )
3670                            db_hdr_vcf_header_infos_description = (
3671                                db_hdr_vcf_header_infos[annotation_field].desc
3672                                or f"{annotation_field} description"
3673                            )
3674                            db_hdr_vcf_header_infos_source = (
3675                                db_hdr_vcf_header_infos[annotation_field].source
3676                                or "unknown"
3677                            )
3678                            db_hdr_vcf_header_infos_version = (
3679                                db_hdr_vcf_header_infos[annotation_field].version
3680                                or "unknown"
3681                            )
3682
3683                            vcf_reader.infos[annotation_fields_new_name] = (
3684                                vcf.parser._Info(
3685                                    annotation_fields_new_name,
3686                                    db_hdr_vcf_header_infos_number,
3687                                    db_hdr_vcf_header_infos_type,
3688                                    db_hdr_vcf_header_infos_description,
3689                                    db_hdr_vcf_header_infos_source,
3690                                    db_hdr_vcf_header_infos_version,
3691                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3692                                )
3693                            )
3694
3695                            # annotation_list.append(annotation_field)
3696                            if annotation_field != annotation_fields_new_name:
3697                                annotation_list.append(
3698                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3699                                )
3700                            else:
3701                                annotation_list.append(annotation_field)
3702
3703                            nb_annotation_field += 1
3704
3705                        else:
3706
3707                            if annotation_field not in db_hdr_vcf.get_header().infos:
3708                                log.warning(
3709                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3710                                )
3711                            if annotation_fields_new_name in self.get_header().infos:
3712                                log.warning(
3713                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3714                                )
3715
3716                    log.info(
3717                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3718                    )
3719
3720                    annotation_infos = ",".join(annotation_list)
3721
3722                    if annotation_infos != "":
3723
3724                        # Protect header for bcftools (remove "#CHROM" and variants line)
3725                        log.debug("Protect Header file - remove #CHROM line if exists")
3726                        tmp_header_vcf = NamedTemporaryFile(
3727                            prefix=self.get_prefix(),
3728                            dir=self.get_tmp_dir(),
3729                            suffix=".hdr",
3730                            delete=False,
3731                        )
3732                        tmp_header_vcf_name = tmp_header_vcf.name
3733                        tmp_files.append(tmp_header_vcf_name)
3734                        # Command
3735                        if db_hdr_file.endswith(".gz"):
3736                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3737                        else:
3738                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3739                        # Run
3740                        run_parallel_commands([command_extract_header], 1)
3741
3742                        # Find chromosomes
3743                        log.debug("Find chromosomes ")
3744                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3745                        sql_query_chromosomes_df = self.get_query_to_df(
3746                            sql_query_chromosomes
3747                        )
3748                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3749
3750                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3751
3752                        # BED columns in the annotation file
3753                        if db_file_type in ["bed"]:
3754                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3755
3756                        for chrom in chomosomes_list:
3757
3758                            # Create BED on initial VCF
3759                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3760                            tmp_bed = NamedTemporaryFile(
3761                                prefix=self.get_prefix(),
3762                                dir=self.get_tmp_dir(),
3763                                suffix=".bed",
3764                                delete=False,
3765                            )
3766                            tmp_bed_name = tmp_bed.name
3767                            tmp_files.append(tmp_bed_name)
3768
3769                            # Detect regions
3770                            log.debug(
3771                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3772                            )
3773                            window = 1000000
3774                            sql_query_intervals_for_bed = f"""
3775                                SELECT  \"#CHROM\",
3776                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3777                                        \"POS\"+{window}
3778                                FROM {table_variants} as table_variants
3779                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3780                            """
3781                            regions = self.conn.execute(
3782                                sql_query_intervals_for_bed
3783                            ).fetchall()
3784                            merged_regions = merge_regions(regions)
3785                            log.debug(
3786                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3787                            )
3788
3789                            header = ["#CHROM", "START", "END"]
3790                            with open(tmp_bed_name, "w") as f:
3791                                # Write the header with tab delimiter
3792                                f.write("\t".join(header) + "\n")
3793                                for d in merged_regions:
3794                                    # Write each data row with tab delimiter
3795                                    f.write("\t".join(map(str, d)) + "\n")
3796
3797                            # Tmp files
3798                            tmp_annotation_vcf = NamedTemporaryFile(
3799                                prefix=self.get_prefix(),
3800                                dir=self.get_tmp_dir(),
3801                                suffix=".vcf.gz",
3802                                delete=False,
3803                            )
3804                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3805                            tmp_files.append(tmp_annotation_vcf_name)
3806                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3807                            tmp_annotation_vcf_name_err = (
3808                                tmp_annotation_vcf_name + ".err"
3809                            )
3810                            err_files.append(tmp_annotation_vcf_name_err)
3811
3812                            # Annotate Command
3813                            log.debug(
3814                                f"Annotation '{annotation}' - add bcftools command"
3815                            )
3816
3817                            # Command
3818                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3819
3820                            # Add command
3821                            commands.append(command_annotate)
3822
3823            # if some commands
3824            if commands:
3825
3826                # Export VCF file
3827                self.export_variant_vcf(
3828                    vcf_file=tmp_vcf_name,
3829                    remove_info=True,
3830                    add_samples=False,
3831                    index=True,
3832                )
3833
3834                # Threads
3835                # calculate threads for annotated commands
3836                if commands:
3837                    threads_bcftools_annotate = round(threads / len(commands))
3838                else:
3839                    threads_bcftools_annotate = 1
3840
3841                if not threads_bcftools_annotate:
3842                    threads_bcftools_annotate = 1
3843
3844                # Add threads option to bcftools commands
3845                if threads_bcftools_annotate > 1:
3846                    commands_threaded = []
3847                    for command in commands:
3848                        commands_threaded.append(
3849                            command.replace(
3850                                f"{bcftools_bin_command} annotate ",
3851                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3852                            )
3853                        )
3854                    commands = commands_threaded
3855
3856                # Command annotation multithreading
3857                log.debug(f"Annotation - Annotation commands: " + str(commands))
3858                log.info(
3859                    f"Annotation - Annotation multithreaded in "
3860                    + str(len(commands))
3861                    + " commands"
3862                )
3863
3864                run_parallel_commands(commands, threads)
3865
3866                # Merge
3867                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3868
3869                if tmp_ann_vcf_list_cmd:
3870
3871                    # Tmp file
3872                    tmp_annotate_vcf = NamedTemporaryFile(
3873                        prefix=self.get_prefix(),
3874                        dir=self.get_tmp_dir(),
3875                        suffix=".vcf.gz",
3876                        delete=True,
3877                    )
3878                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3879                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3880                    err_files.append(tmp_annotate_vcf_name_err)
3881
3882                    # Tmp file remove command
3883                    tmp_files_remove_command = ""
3884                    if tmp_files:
3885                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3886
3887                    # Command merge
3888                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3889                    log.info(
3890                        f"Annotation - Annotation merging "
3891                        + str(len(commands))
3892                        + " annotated files"
3893                    )
3894                    log.debug(f"Annotation - merge command: {merge_command}")
3895                    run_parallel_commands([merge_command], 1)
3896
3897                    # Error messages
3898                    log.info(f"Error/Warning messages:")
3899                    error_message_command_all = []
3900                    error_message_command_warning = []
3901                    error_message_command_err = []
3902                    for err_file in err_files:
3903                        with open(err_file, "r") as f:
3904                            for line in f:
3905                                message = line.strip()
3906                                error_message_command_all.append(message)
3907                                if line.startswith("[W::"):
3908                                    error_message_command_warning.append(message)
3909                                if line.startswith("[E::"):
3910                                    error_message_command_err.append(
3911                                        f"{err_file}: " + message
3912                                    )
3913                    # log info
3914                    for message in list(
3915                        set(error_message_command_err + error_message_command_warning)
3916                    ):
3917                        log.info(f"   {message}")
3918                    # debug info
3919                    for message in list(set(error_message_command_all)):
3920                        log.debug(f"   {message}")
3921                    # failed
3922                    if len(error_message_command_err):
3923                        log.error("Annotation failed: Error in commands")
3924                        raise ValueError("Annotation failed: Error in commands")
3925
3926                    # Update variants
3927                    log.info(f"Annotation - Updating...")
3928                    self.update_from_vcf(tmp_annotate_vcf_name)
3929
3930    def annotation_exomiser(self, threads: int = None) -> None:
3931        """
3932        This function annotate with Exomiser
3933
3934        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3935        - "analysis" (dict/file):
3936            Full analysis dictionary parameters (see Exomiser docs).
3937            Either a dict, or a file in JSON or YAML format.
3938            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
3939            Default : None
3940        - "preset" (string):
3941            Analysis preset (available in config folder).
3942            Used if no full "analysis" is provided.
3943            Default: "exome"
3944        - "phenopacket" (dict/file):
3945            Samples and phenotypic features parameters (see Exomiser docs).
3946            Either a dict, or a file in JSON or YAML format.
3947            Default: None
3948        - "subject" (dict):
3949            Sample parameters (see Exomiser docs).
3950            Example:
3951                "subject":
3952                    {
3953                        "id": "ISDBM322017",
3954                        "sex": "FEMALE"
3955                    }
3956            Default: None
3957        - "sample" (string):
3958            Sample name to construct "subject" section:
3959                "subject":
3960                    {
3961                        "id": "<sample>",
3962                        "sex": "UNKNOWN_SEX"
3963                    }
3964            Default: None
3965        - "phenotypicFeatures" (dict)
3966            Phenotypic features to construct "subject" section.
3967            Example:
3968                "phenotypicFeatures":
3969                    [
3970                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3971                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3972                    ]
3973        - "hpo" (list)
3974            List of HPO ids as phenotypic features.
3975            Example:
3976                "hpo": ['0001156', '0001363', '0011304', '0010055']
3977            Default: []
3978        - "outputOptions" (dict):
3979            Output options (see Exomiser docs).
3980            Default:
3981                "output_options" =
3982                    {
3983                        "outputContributingVariantsOnly": False,
3984                        "numGenes": 0,
3985                        "outputFormats": ["TSV_VARIANT", "VCF"]
3986                    }
3987        - "transcript_source" (string):
3988            Transcript source (either "refseq", "ucsc", "ensembl")
3989            Default: "refseq"
3990        - "exomiser_to_info" (boolean):
3991            Add exomiser TSV file columns as INFO fields in VCF.
3992            Default: False
3993        - "release" (string):
3994            Exomiser database release.
3995            If not exists, database release will be downloaded (take a while).
3996            Default: None (provided by application.properties configuration file)
3997        - "exomiser_application_properties" (file):
3998            Exomiser configuration file (see Exomiser docs).
3999            Useful to automatically download databases (especially for specific genome databases).
4000
4001        Notes:
4002        - If no sample in parameters, first sample in VCF will be chosen
4003        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
4004
4005        :param threads: The number of threads to use
4006        :return: None.
4007        """
4008
4009        # DEBUG
4010        log.debug("Start annotation with Exomiser databases")
4011
4012        # Threads
4013        if not threads:
4014            threads = self.get_threads()
4015        log.debug("Threads: " + str(threads))
4016
4017        # Config
4018        config = self.get_config()
4019        log.debug("Config: " + str(config))
4020
4021        # Config - Folders - Databases
4022        databases_folders = (
4023            config.get("folders", {})
4024            .get("databases", {})
4025            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4026        )
4027        databases_folders = full_path(databases_folders)
4028        if not os.path.exists(databases_folders):
4029            log.error(f"Databases annotations: {databases_folders} NOT found")
4030        log.debug("Databases annotations: " + str(databases_folders))
4031
4032        # Config - Exomiser
4033        exomiser_bin_command = get_bin_command(
4034            bin="exomiser-cli*.jar",
4035            tool="exomiser",
4036            bin_type="jar",
4037            config=config,
4038            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4039        )
4040        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4041        if not exomiser_bin_command:
4042            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4043            log.error(msg_err)
4044            raise ValueError(msg_err)
4045
4046        # Param
4047        param = self.get_param()
4048        log.debug("Param: " + str(param))
4049
4050        # Param - Exomiser
4051        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4052        log.debug(f"Param Exomiser: {param_exomiser}")
4053
4054        # Param - Assembly
4055        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4056        log.debug("Assembly: " + str(assembly))
4057
4058        # Data
4059        table_variants = self.get_table_variants()
4060
4061        # Check if not empty
4062        log.debug("Check if not empty")
4063        sql_query_chromosomes = (
4064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4065        )
4066        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4067            log.info(f"VCF empty")
4068            return False
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Samples
4075        samples = self.get_header_sample_list()
4076        if not samples:
4077            log.error("No Samples in VCF")
4078            return False
4079        log.debug(f"Samples: {samples}")
4080
4081        # Memory limit
4082        memory_limit = self.get_memory("8G")
4083        log.debug(f"memory_limit: {memory_limit}")
4084
4085        # Exomiser java options
4086        exomiser_java_options = (
4087            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4088        )
4089        log.debug(f"Exomiser java options: {exomiser_java_options}")
4090
4091        # Download Exomiser (if not exists)
4092        exomiser_release = param_exomiser.get("release", None)
4093        exomiser_application_properties = param_exomiser.get(
4094            "exomiser_application_properties", None
4095        )
4096        databases_download_exomiser(
4097            assemblies=[assembly],
4098            exomiser_folder=databases_folders,
4099            exomiser_release=exomiser_release,
4100            exomiser_phenotype_release=exomiser_release,
4101            exomiser_application_properties=exomiser_application_properties,
4102        )
4103
4104        # Force annotation
4105        force_update_annotation = True
4106
4107        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4108            log.debug("Start annotation Exomiser")
4109
4110            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4111
4112                # tmp_dir = "/tmp/exomiser"
4113
4114                ### ANALYSIS ###
4115                ################
4116
4117                # Create analysis.json through analysis dict
4118                # either analysis in param or by default
4119                # depending on preset exome/genome)
4120
4121                # Init analysis dict
4122                param_exomiser_analysis_dict = {}
4123
4124                # analysis from param
4125                param_exomiser_analysis = param_exomiser.get("analysis", {})
4126                param_exomiser_analysis = full_path(param_exomiser_analysis)
4127
4128                # If analysis in param -> load analysis json
4129                if param_exomiser_analysis:
4130
4131                    # If param analysis is a file and exists
4132                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4133                        param_exomiser_analysis
4134                    ):
4135                        # Load analysis file into analysis dict (either yaml or json)
4136                        with open(param_exomiser_analysis) as json_file:
4137                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4138
4139                    # If param analysis is a dict
4140                    elif isinstance(param_exomiser_analysis, dict):
4141                        # Load analysis dict into analysis dict (either yaml or json)
4142                        param_exomiser_analysis_dict = param_exomiser_analysis
4143
4144                    # Error analysis type
4145                    else:
4146                        log.error(f"Analysis type unknown. Check param file.")
4147                        raise ValueError(f"Analysis type unknown. Check param file.")
4148
4149                # Case no input analysis config file/dict
4150                # Use preset (exome/genome) to open default config file
4151                if not param_exomiser_analysis_dict:
4152
4153                    # default preset
4154                    default_preset = "exome"
4155
4156                    # Get param preset or default preset
4157                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4158
4159                    # Try to find if preset is a file
4160                    if os.path.exists(param_exomiser_preset):
4161                        # Preset file is provided in full path
4162                        param_exomiser_analysis_default_config_file = (
4163                            param_exomiser_preset
4164                        )
4165                    # elif os.path.exists(full_path(param_exomiser_preset)):
4166                    #     # Preset file is provided in full path
4167                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4168                    elif os.path.exists(
4169                        os.path.join(folder_config, param_exomiser_preset)
4170                    ):
4171                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4172                        param_exomiser_analysis_default_config_file = os.path.join(
4173                            folder_config, param_exomiser_preset
4174                        )
4175                    else:
4176                        # Construct preset file
4177                        param_exomiser_analysis_default_config_file = os.path.join(
4178                            folder_config,
4179                            f"preset-{param_exomiser_preset}-analysis.json",
4180                        )
4181
4182                    # If preset file exists
4183                    param_exomiser_analysis_default_config_file = full_path(
4184                        param_exomiser_analysis_default_config_file
4185                    )
4186                    if os.path.exists(param_exomiser_analysis_default_config_file):
4187                        # Load preset file into analysis dict (either yaml or json)
4188                        with open(
4189                            param_exomiser_analysis_default_config_file
4190                        ) as json_file:
4191                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4192                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4193                                json_file
4194                            )
4195
4196                    # Error preset file
4197                    else:
4198                        log.error(
4199                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4200                        )
4201                        raise ValueError(
4202                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4203                        )
4204
4205                # If no analysis dict created
4206                if not param_exomiser_analysis_dict:
4207                    log.error(f"No analysis config")
4208                    raise ValueError(f"No analysis config")
4209
4210                # Log
4211                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4212
4213                ### PHENOPACKET ###
4214                ###################
4215
4216                # If no PhenoPacket in analysis dict -> check in param
4217                if "phenopacket" not in param_exomiser_analysis_dict:
4218
4219                    # If PhenoPacket in param -> load phenopacket json
4220                    if param_exomiser.get("phenopacket", None):
4221
4222                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4223                        param_exomiser_phenopacket = full_path(
4224                            param_exomiser_phenopacket
4225                        )
4226
4227                        # If param phenopacket is a file and exists
4228                        if isinstance(
4229                            param_exomiser_phenopacket, str
4230                        ) and os.path.exists(param_exomiser_phenopacket):
4231                            # Load phenopacket file into analysis dict (either yaml or json)
4232                            with open(param_exomiser_phenopacket) as json_file:
4233                                param_exomiser_analysis_dict["phenopacket"] = (
4234                                    yaml.safe_load(json_file)
4235                                )
4236
4237                        # If param phenopacket is a dict
4238                        elif isinstance(param_exomiser_phenopacket, dict):
4239                            # Load phenopacket dict into analysis dict (either yaml or json)
4240                            param_exomiser_analysis_dict["phenopacket"] = (
4241                                param_exomiser_phenopacket
4242                            )
4243
4244                        # Error phenopacket type
4245                        else:
4246                            log.error(f"Phenopacket type unknown. Check param file.")
4247                            raise ValueError(
4248                                f"Phenopacket type unknown. Check param file."
4249                            )
4250
4251                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4252                if "phenopacket" not in param_exomiser_analysis_dict:
4253
4254                    # Init PhenoPacket
4255                    param_exomiser_analysis_dict["phenopacket"] = {
4256                        "id": "analysis",
4257                        "proband": {},
4258                    }
4259
4260                    ### Add subject ###
4261
4262                    # If subject exists
4263                    param_exomiser_subject = param_exomiser.get("subject", {})
4264
4265                    # If subject not exists -> found sample ID
4266                    if not param_exomiser_subject:
4267
4268                        # Found sample ID in param
4269                        sample = param_exomiser.get("sample", None)
4270
4271                        # Find sample ID (first sample)
4272                        if not sample:
4273                            sample_list = self.get_header_sample_list()
4274                            if len(sample_list) > 0:
4275                                sample = sample_list[0]
4276                            else:
4277                                log.error(f"No sample found")
4278                                raise ValueError(f"No sample found")
4279
4280                        # Create subject
4281                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4282
4283                    # Add to dict
4284                    param_exomiser_analysis_dict["phenopacket"][
4285                        "subject"
4286                    ] = param_exomiser_subject
4287
4288                    ### Add "phenotypicFeatures" ###
4289
4290                    # If phenotypicFeatures exists
4291                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4292                        "phenotypicFeatures", []
4293                    )
4294
4295                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4296                    if not param_exomiser_phenotypicfeatures:
4297
4298                        # Found HPO in param
4299                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4300
4301                        # Split HPO if list in string format separated by comma
4302                        if isinstance(param_exomiser_hpo, str):
4303                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4304
4305                        # Create HPO list
4306                        for hpo in param_exomiser_hpo:
4307                            hpo_clean = re.sub("[^0-9]", "", hpo)
4308                            param_exomiser_phenotypicfeatures.append(
4309                                {
4310                                    "type": {
4311                                        "id": f"HP:{hpo_clean}",
4312                                        "label": f"HP:{hpo_clean}",
4313                                    }
4314                                }
4315                            )
4316
4317                    # Add to dict
4318                    param_exomiser_analysis_dict["phenopacket"][
4319                        "phenotypicFeatures"
4320                    ] = param_exomiser_phenotypicfeatures
4321
4322                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4323                    if not param_exomiser_phenotypicfeatures:
4324                        for step in param_exomiser_analysis_dict.get(
4325                            "analysis", {}
4326                        ).get("steps", []):
4327                            if "hiPhivePrioritiser" in step:
4328                                param_exomiser_analysis_dict.get("analysis", {}).get(
4329                                    "steps", []
4330                                ).remove(step)
4331
4332                ### Add Input File ###
4333
4334                # Initial file name and htsFiles
4335                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4336                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4337                    {
4338                        "uri": tmp_vcf_name,
4339                        "htsFormat": "VCF",
4340                        "genomeAssembly": assembly,
4341                    }
4342                ]
4343
4344                ### Add metaData ###
4345
4346                # If metaData not in analysis dict
4347                if "metaData" not in param_exomiser_analysis_dict:
4348                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4349                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4350                        "createdBy": "howard",
4351                        "phenopacketSchemaVersion": 1,
4352                    }
4353
4354                ### OutputOptions ###
4355
4356                # Init output result folder
4357                output_results = os.path.join(tmp_dir, "results")
4358
4359                # If no outputOptions in analysis dict
4360                if "outputOptions" not in param_exomiser_analysis_dict:
4361
4362                    # default output formats
4363                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4364
4365                    # Get outputOptions in param
4366                    output_options = param_exomiser.get("outputOptions", None)
4367
4368                    # If no output_options in param -> check
4369                    if not output_options:
4370                        output_options = {
4371                            "outputContributingVariantsOnly": False,
4372                            "numGenes": 0,
4373                            "outputFormats": defaut_output_formats,
4374                        }
4375
4376                    # Replace outputDirectory in output options
4377                    output_options["outputDirectory"] = output_results
4378                    output_options["outputFileName"] = "howard"
4379
4380                    # Add outputOptions in analysis dict
4381                    param_exomiser_analysis_dict["outputOptions"] = output_options
4382
4383                else:
4384
4385                    # Replace output_results and output format (if exists in param)
4386                    param_exomiser_analysis_dict["outputOptions"][
4387                        "outputDirectory"
4388                    ] = output_results
4389                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4390                        list(
4391                            set(
4392                                param_exomiser_analysis_dict.get(
4393                                    "outputOptions", {}
4394                                ).get("outputFormats", [])
4395                                + ["TSV_VARIANT", "VCF"]
4396                            )
4397                        )
4398                    )
4399
4400                # log
4401                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4402
4403                ### ANALYSIS FILE ###
4404                #####################
4405
4406                ### Full JSON analysis config file ###
4407
4408                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4409                with open(exomiser_analysis, "w") as fp:
4410                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4411
4412                ### SPLIT analysis and sample config files
4413
4414                # Splitted analysis dict
4415                param_exomiser_analysis_dict_for_split = (
4416                    param_exomiser_analysis_dict.copy()
4417                )
4418
4419                # Phenopacket JSON file
4420                exomiser_analysis_phenopacket = os.path.join(
4421                    tmp_dir, "analysis_phenopacket.json"
4422                )
4423                with open(exomiser_analysis_phenopacket, "w") as fp:
4424                    json.dump(
4425                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4426                        fp,
4427                        indent=4,
4428                    )
4429
4430                # Analysis JSON file without Phenopacket parameters
4431                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4432                exomiser_analysis_analysis = os.path.join(
4433                    tmp_dir, "analysis_analysis.json"
4434                )
4435                with open(exomiser_analysis_analysis, "w") as fp:
4436                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4437
4438                ### INITAL VCF file ###
4439                #######################
4440
4441                ### Create list of samples to use and include inti initial VCF file ####
4442
4443                # Subject (main sample)
4444                # Get sample ID in analysis dict
4445                sample_subject = (
4446                    param_exomiser_analysis_dict.get("phenopacket", {})
4447                    .get("subject", {})
4448                    .get("id", None)
4449                )
4450                sample_proband = (
4451                    param_exomiser_analysis_dict.get("phenopacket", {})
4452                    .get("proband", {})
4453                    .get("subject", {})
4454                    .get("id", None)
4455                )
4456                sample = []
4457                if sample_subject:
4458                    sample.append(sample_subject)
4459                if sample_proband:
4460                    sample.append(sample_proband)
4461
4462                # Get sample ID within Pedigree
4463                pedigree_persons_list = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("pedigree", {})
4466                    .get("persons", {})
4467                )
4468
4469                # Create list with all sample ID in pedigree (if exists)
4470                pedigree_persons = []
4471                for person in pedigree_persons_list:
4472                    pedigree_persons.append(person.get("individualId"))
4473
4474                # Concat subject sample ID and samples ID in pedigreesamples
4475                samples = list(set(sample + pedigree_persons))
4476
4477                # Check if sample list is not empty
4478                if not samples:
4479                    log.error(f"No samples found")
4480                    raise ValueError(f"No samples found")
4481
4482                # Create VCF with sample (either sample in param or first one by default)
4483                # Export VCF file
4484                self.export_variant_vcf(
4485                    vcf_file=tmp_vcf_name,
4486                    remove_info=True,
4487                    add_samples=True,
4488                    list_samples=samples,
4489                    index=False,
4490                )
4491
4492                ### Execute Exomiser ###
4493                ########################
4494
4495                # Init command
4496                exomiser_command = ""
4497
4498                # Command exomiser options
4499                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4500
4501                # Release
4502                exomiser_release = param_exomiser.get("release", None)
4503                if exomiser_release:
4504                    # phenotype data version
4505                    exomiser_options += (
4506                        f" --exomiser.phenotype.data-version={exomiser_release} "
4507                    )
4508                    # data version
4509                    exomiser_options += (
4510                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4511                    )
4512                    # variant white list
4513                    variant_white_list_file = (
4514                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4515                    )
4516                    if os.path.exists(
4517                        os.path.join(
4518                            databases_folders, assembly, variant_white_list_file
4519                        )
4520                    ):
4521                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4522
4523                # transcript_source
4524                transcript_source = param_exomiser.get(
4525                    "transcript_source", None
4526                )  # ucsc, refseq, ensembl
4527                if transcript_source:
4528                    exomiser_options += (
4529                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4530                    )
4531
4532                # If analysis contain proband param
4533                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4534                    "proband", {}
4535                ):
4536                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4537
4538                # If no proband (usually uniq sample)
4539                else:
4540                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4541
4542                # Log
4543                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4544
4545                # Run command
4546                result = subprocess.call(
4547                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4548                )
4549                if result:
4550                    log.error("Exomiser command failed")
4551                    raise ValueError("Exomiser command failed")
4552
4553                ### RESULTS ###
4554                ###############
4555
4556                ### Annotate with TSV fields ###
4557
4558                # Init result tsv file
4559                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4560
4561                # Init result tsv file
4562                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4563
4564                # Parse TSV file and explode columns in INFO field
4565                if exomiser_to_info and os.path.exists(output_results_tsv):
4566
4567                    # Log
4568                    log.debug("Exomiser columns to VCF INFO field")
4569
4570                    # Retrieve columns and types
4571                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4572                    output_results_tsv_df = self.get_query_to_df(query)
4573                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4574
4575                    # Init concat fields for update
4576                    sql_query_update_concat_fields = []
4577
4578                    # Fields to avoid
4579                    fields_to_avoid = [
4580                        "CONTIG",
4581                        "START",
4582                        "END",
4583                        "REF",
4584                        "ALT",
4585                        "QUAL",
4586                        "FILTER",
4587                        "GENOTYPE",
4588                    ]
4589
4590                    # List all columns to add into header
4591                    for header_column in output_results_tsv_columns:
4592
4593                        # If header column is enable
4594                        if header_column not in fields_to_avoid:
4595
4596                            # Header info type
4597                            header_info_type = "String"
4598                            header_column_df = output_results_tsv_df[header_column]
4599                            header_column_df_dtype = header_column_df.dtype
4600                            if header_column_df_dtype == object:
4601                                if (
4602                                    pd.to_numeric(header_column_df, errors="coerce")
4603                                    .notnull()
4604                                    .all()
4605                                ):
4606                                    header_info_type = "Float"
4607                            else:
4608                                header_info_type = "Integer"
4609
4610                            # Header info
4611                            characters_to_validate = ["-"]
4612                            pattern = "[" + "".join(characters_to_validate) + "]"
4613                            header_info_name = re.sub(
4614                                pattern,
4615                                "_",
4616                                f"Exomiser_{header_column}".replace("#", ""),
4617                            )
4618                            header_info_number = "."
4619                            header_info_description = (
4620                                f"Exomiser {header_column} annotation"
4621                            )
4622                            header_info_source = "Exomiser"
4623                            header_info_version = "unknown"
4624                            header_info_code = CODE_TYPE_MAP[header_info_type]
4625                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4626                                header_info_name,
4627                                header_info_number,
4628                                header_info_type,
4629                                header_info_description,
4630                                header_info_source,
4631                                header_info_version,
4632                                header_info_code,
4633                            )
4634
4635                            # Add field to add for update to concat fields
4636                            sql_query_update_concat_fields.append(
4637                                f"""
4638                                CASE
4639                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4640                                    THEN concat(
4641                                        '{header_info_name}=',
4642                                        table_parquet."{header_column}",
4643                                        ';'
4644                                        )
4645
4646                                    ELSE ''
4647                                END
4648                            """
4649                            )
4650
4651                    # Update query
4652                    sql_query_update = f"""
4653                        UPDATE {table_variants} as table_variants
4654                            SET INFO = concat(
4655                                            CASE
4656                                                WHEN INFO NOT IN ('', '.')
4657                                                THEN INFO
4658                                                ELSE ''
4659                                            END,
4660                                            CASE
4661                                                WHEN table_variants.INFO NOT IN ('','.')
4662                                                THEN ';'
4663                                                ELSE ''
4664                                            END,
4665                                            (
4666                                            SELECT 
4667                                                concat(
4668                                                    {",".join(sql_query_update_concat_fields)}
4669                                                )
4670                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4671                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4672                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4673                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4674                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4675                                            )
4676                                        )
4677                            ;
4678                        """
4679
4680                    # Update
4681                    self.conn.execute(sql_query_update)
4682
4683                ### Annotate with VCF INFO field ###
4684
4685                # Init result VCF file
4686                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4687
4688                # If VCF exists
4689                if os.path.exists(output_results_vcf):
4690
4691                    # Log
4692                    log.debug("Exomiser result VCF update variants")
4693
4694                    # Find Exomiser INFO field annotation in header
4695                    with gzip.open(output_results_vcf, "rt") as f:
4696                        header_list = self.read_vcf_header(f)
4697                    exomiser_vcf_header = vcf.Reader(
4698                        io.StringIO("\n".join(header_list))
4699                    )
4700
4701                    # Add annotation INFO field to header
4702                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4703
4704                    # Update variants with VCF
4705                    self.update_from_vcf(output_results_vcf)
4706
4707        return True
4708
4709    def annotation_snpeff(self, threads: int = None) -> None:
4710        """
4711        This function annotate with snpEff
4712
4713        :param threads: The number of threads to use
4714        :return: the value of the variable "return_value".
4715        """
4716
4717        # DEBUG
4718        log.debug("Start annotation with snpeff databases")
4719
4720        # Threads
4721        if not threads:
4722            threads = self.get_threads()
4723        log.debug("Threads: " + str(threads))
4724
4725        # DEBUG
4726        delete_tmp = True
4727        if self.get_config().get("verbosity", "warning") in ["debug"]:
4728            delete_tmp = False
4729            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4730
4731        # Config
4732        config = self.get_config()
4733        log.debug("Config: " + str(config))
4734
4735        # Config - Folders - Databases
4736        databases_folders = (
4737            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4738        )
4739        log.debug("Databases annotations: " + str(databases_folders))
4740
4741        # # Config - Java
4742        # java_bin = get_bin(
4743        #     tool="java",
4744        #     bin="java",
4745        #     bin_type="bin",
4746        #     config=config,
4747        #     default_folder="/usr/bin",
4748        # )
4749        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4750        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4751        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4752
4753        # # Config - snpEff bin
4754        # snpeff_jar = get_bin(
4755        #     tool="snpeff",
4756        #     bin="snpEff.jar",
4757        #     bin_type="jar",
4758        #     config=config,
4759        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4760        # )
4761        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4762        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4764
4765        # Config - snpEff bin command
4766        snpeff_bin_command = get_bin_command(
4767            bin="snpEff.jar",
4768            tool="snpeff",
4769            bin_type="jar",
4770            config=config,
4771            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4772        )
4773        if not snpeff_bin_command:
4774            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4775            log.error(msg_err)
4776            raise ValueError(msg_err)
4777
4778        # Config - snpEff databases
4779        snpeff_databases = (
4780            config.get("folders", {})
4781            .get("databases", {})
4782            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4783        )
4784        snpeff_databases = full_path(snpeff_databases)
4785        if snpeff_databases is not None and snpeff_databases != "":
4786            log.debug(f"Create snpEff databases folder")
4787            if not os.path.exists(snpeff_databases):
4788                os.makedirs(snpeff_databases)
4789
4790        # Param
4791        param = self.get_param()
4792        log.debug("Param: " + str(param))
4793
4794        # Param
4795        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4796        log.debug("Options: " + str(options))
4797
4798        # Param - Assembly
4799        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4800
4801        # Param - Options
4802        snpeff_options = (
4803            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4804        )
4805        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4806        snpeff_csvstats = (
4807            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4808        )
4809        if snpeff_stats:
4810            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4811            snpeff_stats = full_path(snpeff_stats)
4812            snpeff_options += f" -stats {snpeff_stats}"
4813        if snpeff_csvstats:
4814            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4815            snpeff_csvstats = full_path(snpeff_csvstats)
4816            snpeff_options += f" -csvStats {snpeff_csvstats}"
4817
4818        # Data
4819        table_variants = self.get_table_variants()
4820
4821        # Check if not empty
4822        log.debug("Check if not empty")
4823        sql_query_chromosomes = (
4824            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4825        )
4826        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4827        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4828            log.info(f"VCF empty")
4829            return
4830
4831        # Export in VCF
4832        log.debug("Create initial file to annotate")
4833        tmp_vcf = NamedTemporaryFile(
4834            prefix=self.get_prefix(),
4835            dir=self.get_tmp_dir(),
4836            suffix=".vcf.gz",
4837            delete=True,
4838        )
4839        tmp_vcf_name = tmp_vcf.name
4840
4841        # VCF header
4842        vcf_reader = self.get_header()
4843        log.debug("Initial header: " + str(vcf_reader.infos))
4844
4845        # Existing annotations
4846        for vcf_annotation in self.get_header().infos:
4847
4848            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4849            log.debug(
4850                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4851            )
4852
4853        # Memory limit
4854        # if config.get("memory", None):
4855        #     memory_limit = config.get("memory", "8G")
4856        # else:
4857        #     memory_limit = "8G"
4858        memory_limit = self.get_memory("8G")
4859        log.debug(f"memory_limit: {memory_limit}")
4860
4861        # snpEff java options
4862        snpeff_java_options = (
4863            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4864        )
4865        log.debug(f"Exomiser java options: {snpeff_java_options}")
4866
4867        force_update_annotation = True
4868
4869        if "ANN" not in self.get_header().infos or force_update_annotation:
4870
4871            # Check snpEff database
4872            log.debug(f"Check snpEff databases {[assembly]}")
4873            databases_download_snpeff(
4874                folder=snpeff_databases, assemblies=[assembly], config=config
4875            )
4876
4877            # Export VCF file
4878            self.export_variant_vcf(
4879                vcf_file=tmp_vcf_name,
4880                remove_info=True,
4881                add_samples=False,
4882                index=True,
4883            )
4884
4885            # Tmp file
4886            err_files = []
4887            tmp_annotate_vcf = NamedTemporaryFile(
4888                prefix=self.get_prefix(),
4889                dir=self.get_tmp_dir(),
4890                suffix=".vcf",
4891                delete=False,
4892            )
4893            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4894            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4895            err_files.append(tmp_annotate_vcf_name_err)
4896
4897            # Command
4898            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4899            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4900            run_parallel_commands([snpeff_command], 1)
4901
4902            # Error messages
4903            log.info(f"Error/Warning messages:")
4904            error_message_command_all = []
4905            error_message_command_warning = []
4906            error_message_command_err = []
4907            for err_file in err_files:
4908                with open(err_file, "r") as f:
4909                    for line in f:
4910                        message = line.strip()
4911                        error_message_command_all.append(message)
4912                        if line.startswith("[W::"):
4913                            error_message_command_warning.append(message)
4914                        if line.startswith("[E::"):
4915                            error_message_command_err.append(f"{err_file}: " + message)
4916            # log info
4917            for message in list(
4918                set(error_message_command_err + error_message_command_warning)
4919            ):
4920                log.info(f"   {message}")
4921            # debug info
4922            for message in list(set(error_message_command_all)):
4923                log.debug(f"   {message}")
4924            # failed
4925            if len(error_message_command_err):
4926                log.error("Annotation failed: Error in commands")
4927                raise ValueError("Annotation failed: Error in commands")
4928
4929            # Find annotation in header
4930            with open(tmp_annotate_vcf_name, "rt") as f:
4931                header_list = self.read_vcf_header(f)
4932            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4933
4934            for ann in annovar_vcf_header.infos:
4935                if ann not in self.get_header().infos:
4936                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4937
4938            # Update variants
4939            log.info(f"Annotation - Updating...")
4940            self.update_from_vcf(tmp_annotate_vcf_name)
4941
4942        else:
4943            if "ANN" in self.get_header().infos:
4944                log.debug(f"Existing snpEff annotations in VCF")
4945            if force_update_annotation:
4946                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4947
4948    def annotation_annovar(self, threads: int = None) -> None:
4949        """
4950        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4951        annotations
4952
4953        :param threads: number of threads to use
4954        :return: the value of the variable "return_value".
4955        """
4956
4957        # DEBUG
4958        log.debug("Start annotation with Annovar databases")
4959
4960        # Threads
4961        if not threads:
4962            threads = self.get_threads()
4963        log.debug("Threads: " + str(threads))
4964
4965        # Tmp en Err files
4966        tmp_files = []
4967        err_files = []
4968
4969        # DEBUG
4970        delete_tmp = True
4971        if self.get_config().get("verbosity", "warning") in ["debug"]:
4972            delete_tmp = False
4973            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4974
4975        # Config
4976        config = self.get_config()
4977        log.debug("Config: " + str(config))
4978
4979        # Config - Folders - Databases
4980        databases_folders = (
4981            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4982        )
4983        log.debug("Databases annotations: " + str(databases_folders))
4984
4985        # Config - annovar bin command
4986        annovar_bin_command = get_bin_command(
4987            bin="table_annovar.pl",
4988            tool="annovar",
4989            bin_type="perl",
4990            config=config,
4991            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4992        )
4993        if not annovar_bin_command:
4994            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4995            log.error(msg_err)
4996            raise ValueError(msg_err)
4997
4998        # Config - BCFTools bin command
4999        bcftools_bin_command = get_bin_command(
5000            bin="bcftools",
5001            tool="bcftools",
5002            bin_type="bin",
5003            config=config,
5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5005        )
5006        if not bcftools_bin_command:
5007            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5008            log.error(msg_err)
5009            raise ValueError(msg_err)
5010
5011        # Config - annovar databases
5012        annovar_databases = (
5013            config.get("folders", {})
5014            .get("databases", {})
5015            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5016        )
5017        annovar_databases = full_path(annovar_databases)
5018        if annovar_databases != "" and not os.path.exists(annovar_databases):
5019            os.makedirs(annovar_databases)
5020
5021        # Param
5022        param = self.get_param()
5023        log.debug("Param: " + str(param))
5024
5025        # Param - options
5026        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5027        log.debug("Options: " + str(options))
5028
5029        # Param - annotations
5030        annotations = (
5031            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5032        )
5033        log.debug("Annotations: " + str(annotations))
5034
5035        # Param - Assembly
5036        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5037
5038        # Annovar database assembly
5039        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5040        if annovar_databases_assembly != "" and not os.path.exists(
5041            annovar_databases_assembly
5042        ):
5043            os.makedirs(annovar_databases_assembly)
5044
5045        # Data
5046        table_variants = self.get_table_variants()
5047
5048        # Check if not empty
5049        log.debug("Check if not empty")
5050        sql_query_chromosomes = (
5051            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5052        )
5053        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5054        if not sql_query_chromosomes_df["count"][0]:
5055            log.info(f"VCF empty")
5056            return
5057
5058        # VCF header
5059        vcf_reader = self.get_header()
5060        log.debug("Initial header: " + str(vcf_reader.infos))
5061
5062        # Existing annotations
5063        for vcf_annotation in self.get_header().infos:
5064
5065            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5066            log.debug(
5067                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5068            )
5069
5070        force_update_annotation = True
5071
5072        if annotations:
5073
5074            commands = []
5075            tmp_annotates_vcf_name_list = []
5076
5077            # Export in VCF
5078            log.debug("Create initial file to annotate")
5079            tmp_vcf = NamedTemporaryFile(
5080                prefix=self.get_prefix(),
5081                dir=self.get_tmp_dir(),
5082                suffix=".vcf.gz",
5083                delete=False,
5084            )
5085            tmp_vcf_name = tmp_vcf.name
5086            tmp_files.append(tmp_vcf_name)
5087            tmp_files.append(tmp_vcf_name + ".tbi")
5088
5089            # Export VCF file
5090            self.export_variant_vcf(
5091                vcf_file=tmp_vcf_name,
5092                remove_info=".",
5093                add_samples=False,
5094                index=True,
5095            )
5096
5097            # Create file for field rename
5098            log.debug("Create file for field rename")
5099            tmp_rename = NamedTemporaryFile(
5100                prefix=self.get_prefix(),
5101                dir=self.get_tmp_dir(),
5102                suffix=".rename",
5103                delete=False,
5104            )
5105            tmp_rename_name = tmp_rename.name
5106            tmp_files.append(tmp_rename_name)
5107
5108            # Check Annovar database
5109            log.debug(
5110                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5111            )
5112            databases_download_annovar(
5113                folder=annovar_databases,
5114                files=list(annotations.keys()),
5115                assemblies=[assembly],
5116            )
5117
5118            for annotation in annotations:
5119                annotation_fields = annotations[annotation]
5120
5121                if not annotation_fields:
5122                    annotation_fields = {"INFO": None}
5123
5124                log.info(f"Annotations Annovar - database '{annotation}'")
5125                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5126
5127                # Tmp file for annovar
5128                err_files = []
5129                tmp_annotate_vcf_directory = TemporaryDirectory(
5130                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5131                )
5132                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5133                tmp_annotate_vcf_name_annovar = (
5134                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5135                )
5136                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5137                err_files.append(tmp_annotate_vcf_name_err)
5138                tmp_files.append(tmp_annotate_vcf_name_err)
5139
5140                # Tmp file final vcf annotated by annovar
5141                tmp_annotate_vcf = NamedTemporaryFile(
5142                    prefix=self.get_prefix(),
5143                    dir=self.get_tmp_dir(),
5144                    suffix=".vcf.gz",
5145                    delete=False,
5146                )
5147                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5148                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name)
5150                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5151
5152                # Number of fields
5153                annotation_list = []
5154                annotation_renamed_list = []
5155
5156                for annotation_field in annotation_fields:
5157
5158                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5159                    annotation_fields_new_name = annotation_fields.get(
5160                        annotation_field, annotation_field
5161                    )
5162                    if not annotation_fields_new_name:
5163                        annotation_fields_new_name = annotation_field
5164
5165                    if (
5166                        force_update_annotation
5167                        or annotation_fields_new_name not in self.get_header().infos
5168                    ):
5169                        annotation_list.append(annotation_field)
5170                        annotation_renamed_list.append(annotation_fields_new_name)
5171                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5172                        log.warning(
5173                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5174                        )
5175
5176                    # Add rename info
5177                    run_parallel_commands(
5178                        [
5179                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5180                        ],
5181                        1,
5182                    )
5183
5184                # log.debug("fields_to_removed: " + str(fields_to_removed))
5185                log.debug("annotation_list: " + str(annotation_list))
5186
5187                # protocol
5188                protocol = annotation
5189
5190                # argument
5191                argument = ""
5192
5193                # operation
5194                operation = "f"
5195                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5196                    "ensGene"
5197                ):
5198                    operation = "g"
5199                    if options.get("genebase", None):
5200                        argument = f"""'{options.get("genebase","")}'"""
5201                elif annotation in ["cytoBand"]:
5202                    operation = "r"
5203
5204                # argument option
5205                argument_option = ""
5206                if argument != "":
5207                    argument_option = " --argument " + argument
5208
5209                # command options
5210                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5211                for option in options:
5212                    if option not in ["genebase"]:
5213                        command_options += f""" --{option}={options[option]}"""
5214
5215                # Command
5216
5217                # Command - Annovar
5218                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5219                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5220
5221                # Command - start pipe
5222                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5223
5224                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5225                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5226
5227                # Command - Special characters (refGene annotation)
5228                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5229
5230                # Command - Clean empty fields (with value ".")
5231                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5232
5233                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5234                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5235                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5236                    # for ann in annotation_renamed_list:
5237                    for ann in annotation_list:
5238                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5239
5240                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5241
5242                # Command - indexing
5243                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5244
5245                log.debug(f"Annotation - Annovar command: {command_annovar}")
5246                run_parallel_commands([command_annovar], 1)
5247
5248                # Error messages
5249                log.info(f"Error/Warning messages:")
5250                error_message_command_all = []
5251                error_message_command_warning = []
5252                error_message_command_err = []
5253                for err_file in err_files:
5254                    with open(err_file, "r") as f:
5255                        for line in f:
5256                            message = line.strip()
5257                            error_message_command_all.append(message)
5258                            if line.startswith("[W::") or line.startswith("WARNING"):
5259                                error_message_command_warning.append(message)
5260                            if line.startswith("[E::") or line.startswith("ERROR"):
5261                                error_message_command_err.append(
5262                                    f"{err_file}: " + message
5263                                )
5264                # log info
5265                for message in list(
5266                    set(error_message_command_err + error_message_command_warning)
5267                ):
5268                    log.info(f"   {message}")
5269                # debug info
5270                for message in list(set(error_message_command_all)):
5271                    log.debug(f"   {message}")
5272                # failed
5273                if len(error_message_command_err):
5274                    log.error("Annotation failed: Error in commands")
5275                    raise ValueError("Annotation failed: Error in commands")
5276
5277            if tmp_annotates_vcf_name_list:
5278
5279                # List of annotated files
5280                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5281
5282                # Tmp file
5283                tmp_annotate_vcf = NamedTemporaryFile(
5284                    prefix=self.get_prefix(),
5285                    dir=self.get_tmp_dir(),
5286                    suffix=".vcf.gz",
5287                    delete=False,
5288                )
5289                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5290                tmp_files.append(tmp_annotate_vcf_name)
5291                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5292                err_files.append(tmp_annotate_vcf_name_err)
5293                tmp_files.append(tmp_annotate_vcf_name_err)
5294
5295                # Command merge
5296                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5297                log.info(
5298                    f"Annotation Annovar - Annotation merging "
5299                    + str(len(tmp_annotates_vcf_name_list))
5300                    + " annotated files"
5301                )
5302                log.debug(f"Annotation - merge command: {merge_command}")
5303                run_parallel_commands([merge_command], 1)
5304
5305                # Find annotation in header
5306                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5307                    header_list = self.read_vcf_header(f)
5308                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5309
5310                for ann in annovar_vcf_header.infos:
5311                    if ann not in self.get_header().infos:
5312                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5313
5314                # Update variants
5315                log.info(f"Annotation Annovar - Updating...")
5316                self.update_from_vcf(tmp_annotate_vcf_name)
5317
5318            # Clean files
5319            # Tmp file remove command
5320            if True:
5321                tmp_files_remove_command = ""
5322                if tmp_files:
5323                    tmp_files_remove_command = " ".join(tmp_files)
5324                clean_command = f" rm -f {tmp_files_remove_command} "
5325                log.debug(f"Annotation Annovar - Annotation cleaning ")
5326                log.debug(f"Annotation - cleaning command: {clean_command}")
5327                run_parallel_commands([clean_command], 1)
5328
5329    # Parquet
5330    def annotation_parquet(self, threads: int = None) -> None:
5331        """
5332        It takes a VCF file, and annotates it with a parquet file
5333
5334        :param threads: number of threads to use for the annotation
5335        :return: the value of the variable "result".
5336        """
5337
5338        # DEBUG
5339        log.debug("Start annotation with parquet databases")
5340
5341        # Threads
5342        if not threads:
5343            threads = self.get_threads()
5344        log.debug("Threads: " + str(threads))
5345
5346        # DEBUG
5347        delete_tmp = True
5348        if self.get_config().get("verbosity", "warning") in ["debug"]:
5349            delete_tmp = False
5350            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5351
5352        # Config
5353        databases_folders = set(
5354            self.get_config()
5355            .get("folders", {})
5356            .get("databases", {})
5357            .get("annotations", ["."])
5358            + self.get_config()
5359            .get("folders", {})
5360            .get("databases", {})
5361            .get("parquet", ["."])
5362        )
5363        log.debug("Databases annotations: " + str(databases_folders))
5364
5365        # Param
5366        annotations = (
5367            self.get_param()
5368            .get("annotation", {})
5369            .get("parquet", {})
5370            .get("annotations", None)
5371        )
5372        log.debug("Annotations: " + str(annotations))
5373
5374        # Assembly
5375        assembly = self.get_param().get(
5376            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5377        )
5378
5379        # Force Update Annotation
5380        force_update_annotation = (
5381            self.get_param()
5382            .get("annotation", {})
5383            .get("options", {})
5384            .get("annotations_update", False)
5385        )
5386        log.debug(f"force_update_annotation={force_update_annotation}")
5387        force_append_annotation = (
5388            self.get_param()
5389            .get("annotation", {})
5390            .get("options", {})
5391            .get("annotations_append", False)
5392        )
5393        log.debug(f"force_append_annotation={force_append_annotation}")
5394
5395        # Data
5396        table_variants = self.get_table_variants()
5397
5398        # Check if not empty
5399        log.debug("Check if not empty")
5400        sql_query_chromosomes_df = self.get_query_to_df(
5401            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5402        )
5403        if not sql_query_chromosomes_df["count"][0]:
5404            log.info(f"VCF empty")
5405            return
5406
5407        # VCF header
5408        vcf_reader = self.get_header()
5409        log.debug("Initial header: " + str(vcf_reader.infos))
5410
5411        # Nb Variants POS
5412        log.debug("NB Variants Start")
5413        nb_variants = self.conn.execute(
5414            f"SELECT count(*) AS count FROM variants"
5415        ).fetchdf()["count"][0]
5416        log.debug("NB Variants Stop")
5417
5418        # Existing annotations
5419        for vcf_annotation in self.get_header().infos:
5420
5421            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5422            log.debug(
5423                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5424            )
5425
5426        # Added columns
5427        added_columns = []
5428
5429        # drop indexes
5430        log.debug(f"Drop indexes...")
5431        self.drop_indexes()
5432
5433        if annotations:
5434
5435            if "ALL" in annotations:
5436
5437                all_param = annotations.get("ALL", {})
5438                all_param_formats = all_param.get("formats", None)
5439                all_param_releases = all_param.get("releases", None)
5440
5441                databases_infos_dict = self.scan_databases(
5442                    database_formats=all_param_formats,
5443                    database_releases=all_param_releases,
5444                )
5445                for database_infos in databases_infos_dict.keys():
5446                    if database_infos not in annotations:
5447                        annotations[database_infos] = {"INFO": None}
5448
5449            for annotation in annotations:
5450
5451                if annotation in ["ALL"]:
5452                    continue
5453
5454                # Annotation Name
5455                annotation_name = os.path.basename(annotation)
5456
5457                # Annotation fields
5458                annotation_fields = annotations[annotation]
5459                if not annotation_fields:
5460                    annotation_fields = {"INFO": None}
5461
5462                log.debug(f"Annotation '{annotation_name}'")
5463                log.debug(
5464                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5465                )
5466
5467                # Create Database
5468                database = Database(
5469                    database=annotation,
5470                    databases_folders=databases_folders,
5471                    assembly=assembly,
5472                )
5473
5474                # Find files
5475                parquet_file = database.get_database()
5476                parquet_hdr_file = database.get_header_file()
5477                parquet_type = database.get_type()
5478
5479                # Check if files exists
5480                if not parquet_file or not parquet_hdr_file:
5481                    log.error("Annotation failed: file not found")
5482                    raise ValueError("Annotation failed: file not found")
5483                else:
5484                    # Get parquet connexion
5485                    parquet_sql_attach = database.get_sql_database_attach(
5486                        output="query"
5487                    )
5488                    if parquet_sql_attach:
5489                        self.conn.execute(parquet_sql_attach)
5490                    parquet_file_link = database.get_sql_database_link()
5491                    # Log
5492                    log.debug(
5493                        f"Annotation '{annotation_name}' - file: "
5494                        + str(parquet_file)
5495                        + " and "
5496                        + str(parquet_hdr_file)
5497                    )
5498
5499                    # Database full header columns
5500                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5501                        parquet_hdr_file
5502                    )
5503                    # Log
5504                    log.debug(
5505                        "Annotation database header columns : "
5506                        + str(parquet_hdr_vcf_header_columns)
5507                    )
5508
5509                    # Load header as VCF object
5510                    parquet_hdr_vcf_header_infos = database.get_header().infos
5511                    # Log
5512                    log.debug(
5513                        "Annotation database header: "
5514                        + str(parquet_hdr_vcf_header_infos)
5515                    )
5516
5517                    # Get extra infos
5518                    parquet_columns = database.get_extra_columns()
5519                    # Log
5520                    log.debug("Annotation database Columns: " + str(parquet_columns))
5521
5522                    # Add extra columns if "ALL" in annotation_fields
5523                    # if "ALL" in annotation_fields:
5524                    #     allow_add_extra_column = True
5525                    if "ALL" in annotation_fields and database.get_extra_columns():
5526                        for extra_column in database.get_extra_columns():
5527                            if (
5528                                extra_column not in annotation_fields
5529                                and extra_column.replace("INFO/", "")
5530                                not in parquet_hdr_vcf_header_infos
5531                            ):
5532                                parquet_hdr_vcf_header_infos[extra_column] = (
5533                                    vcf.parser._Info(
5534                                        extra_column,
5535                                        ".",
5536                                        "String",
5537                                        f"{extra_column} description",
5538                                        "unknown",
5539                                        "unknown",
5540                                        self.code_type_map["String"],
5541                                    )
5542                                )
5543
5544                    # For all fields in database
5545                    annotation_fields_all = False
5546                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5547                        annotation_fields_all = True
5548                        annotation_fields = {
5549                            key: key for key in parquet_hdr_vcf_header_infos
5550                        }
5551
5552                        log.debug(
5553                            "Annotation database header - All annotations added: "
5554                            + str(annotation_fields)
5555                        )
5556
5557                    # Init
5558
5559                    # List of annotation fields to use
5560                    sql_query_annotation_update_info_sets = []
5561
5562                    # List of annotation to agregate
5563                    sql_query_annotation_to_agregate = []
5564
5565                    # Number of fields
5566                    nb_annotation_field = 0
5567
5568                    # Annotation fields processed
5569                    annotation_fields_processed = []
5570
5571                    # Columns mapping
5572                    map_columns = database.map_columns(
5573                        columns=annotation_fields, prefixes=["INFO/"]
5574                    )
5575
5576                    # Query dict for fields to remove (update option)
5577                    query_dict_remove = {}
5578
5579                    # Fetch Anotation fields
5580                    for annotation_field in annotation_fields:
5581
5582                        # annotation_field_column
5583                        annotation_field_column = map_columns.get(
5584                            annotation_field, "INFO"
5585                        )
5586
5587                        # field new name, if parametered
5588                        annotation_fields_new_name = annotation_fields.get(
5589                            annotation_field, annotation_field
5590                        )
5591                        if not annotation_fields_new_name:
5592                            annotation_fields_new_name = annotation_field
5593
5594                        # To annotate
5595                        # force_update_annotation = True
5596                        # force_append_annotation = True
5597                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5598                        if annotation_field in parquet_hdr_vcf_header_infos and (
5599                            force_update_annotation
5600                            or force_append_annotation
5601                            or (
5602                                annotation_fields_new_name
5603                                not in self.get_header().infos
5604                            )
5605                        ):
5606
5607                            # Add field to annotation to process list
5608                            annotation_fields_processed.append(
5609                                annotation_fields_new_name
5610                            )
5611
5612                            # explode infos for the field
5613                            annotation_fields_new_name_info_msg = ""
5614                            if (
5615                                force_update_annotation
5616                                and annotation_fields_new_name
5617                                in self.get_header().infos
5618                            ):
5619                                # Remove field from INFO
5620                                query = f"""
5621                                    UPDATE {table_variants} as table_variants
5622                                    SET INFO = REGEXP_REPLACE(
5623                                                concat(table_variants.INFO,''),
5624                                                ';*{annotation_fields_new_name}=[^;]*',
5625                                                ''
5626                                                )
5627                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5628                                """
5629                                annotation_fields_new_name_info_msg = " [update]"
5630                                query_dict_remove[
5631                                    f"remove 'INFO/{annotation_fields_new_name}'"
5632                                ] = query
5633
5634                            # Sep between fields in INFO
5635                            nb_annotation_field += 1
5636                            if nb_annotation_field > 1:
5637                                annotation_field_sep = ";"
5638                            else:
5639                                annotation_field_sep = ""
5640
5641                            log.info(
5642                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5643                            )
5644
5645                            # Add INFO field to header
5646                            parquet_hdr_vcf_header_infos_number = (
5647                                parquet_hdr_vcf_header_infos[annotation_field].num
5648                                or "."
5649                            )
5650                            parquet_hdr_vcf_header_infos_type = (
5651                                parquet_hdr_vcf_header_infos[annotation_field].type
5652                                or "String"
5653                            )
5654                            parquet_hdr_vcf_header_infos_description = (
5655                                parquet_hdr_vcf_header_infos[annotation_field].desc
5656                                or f"{annotation_field} description"
5657                            )
5658                            parquet_hdr_vcf_header_infos_source = (
5659                                parquet_hdr_vcf_header_infos[annotation_field].source
5660                                or "unknown"
5661                            )
5662                            parquet_hdr_vcf_header_infos_version = (
5663                                parquet_hdr_vcf_header_infos[annotation_field].version
5664                                or "unknown"
5665                            )
5666
5667                            vcf_reader.infos[annotation_fields_new_name] = (
5668                                vcf.parser._Info(
5669                                    annotation_fields_new_name,
5670                                    parquet_hdr_vcf_header_infos_number,
5671                                    parquet_hdr_vcf_header_infos_type,
5672                                    parquet_hdr_vcf_header_infos_description,
5673                                    parquet_hdr_vcf_header_infos_source,
5674                                    parquet_hdr_vcf_header_infos_version,
5675                                    self.code_type_map[
5676                                        parquet_hdr_vcf_header_infos_type
5677                                    ],
5678                                )
5679                            )
5680
5681                            # Append
5682                            if force_append_annotation:
5683                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5684                            else:
5685                                query_case_when_append = ""
5686
5687                            # Annotation/Update query fields
5688                            # Found in INFO column
5689                            if (
5690                                annotation_field_column == "INFO"
5691                                and "INFO" in parquet_hdr_vcf_header_columns
5692                            ):
5693                                sql_query_annotation_update_info_sets.append(
5694                                    f"""
5695                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5696                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5697                                        ELSE ''
5698                                    END
5699                                """
5700                                )
5701                            # Found in a specific column
5702                            else:
5703                                sql_query_annotation_update_info_sets.append(
5704                                    f"""
5705                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5706                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5707                                        ELSE ''
5708                                    END
5709                                """
5710                                )
5711                                sql_query_annotation_to_agregate.append(
5712                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5713                                )
5714
5715                        # Not to annotate
5716                        else:
5717
5718                            if force_update_annotation:
5719                                annotation_message = "forced"
5720                            else:
5721                                annotation_message = "skipped"
5722
5723                            if annotation_field not in parquet_hdr_vcf_header_infos:
5724                                log.warning(
5725                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5726                                )
5727                            if annotation_fields_new_name in self.get_header().infos:
5728                                log.warning(
5729                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5730                                )
5731
5732                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5733                    # allow_annotation_full_info = True
5734                    allow_annotation_full_info = not force_append_annotation
5735
5736                    if parquet_type in ["regions"]:
5737                        allow_annotation_full_info = False
5738
5739                    if (
5740                        allow_annotation_full_info
5741                        and nb_annotation_field == len(annotation_fields)
5742                        and annotation_fields_all
5743                        and (
5744                            "INFO" in parquet_hdr_vcf_header_columns
5745                            and "INFO" in database.get_extra_columns()
5746                        )
5747                    ):
5748                        log.debug("Column INFO annotation enabled")
5749                        sql_query_annotation_update_info_sets = []
5750                        sql_query_annotation_update_info_sets.append(
5751                            f" table_parquet.INFO "
5752                        )
5753
5754                    if sql_query_annotation_update_info_sets:
5755
5756                        # Annotate
5757                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5758
5759                        # Join query annotation update info sets for SQL
5760                        sql_query_annotation_update_info_sets_sql = ",".join(
5761                            sql_query_annotation_update_info_sets
5762                        )
5763
5764                        # Check chromosomes list (and variants infos)
5765                        sql_query_chromosomes = f"""
5766                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5767                            FROM {table_variants} as table_variants
5768                            GROUP BY table_variants."#CHROM"
5769                            ORDER BY table_variants."#CHROM"
5770                            """
5771                        sql_query_chromosomes_df = self.conn.execute(
5772                            sql_query_chromosomes
5773                        ).df()
5774                        sql_query_chromosomes_dict = {
5775                            entry["CHROM"]: {
5776                                "count": entry["count_variants"],
5777                                "min": entry["min_variants"],
5778                                "max": entry["max_variants"],
5779                            }
5780                            for index, entry in sql_query_chromosomes_df.iterrows()
5781                        }
5782
5783                        # Init
5784                        nb_of_query = 0
5785                        nb_of_variant_annotated = 0
5786                        query_dict = query_dict_remove
5787
5788                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5789                        for chrom in sql_query_chromosomes_dict:
5790
5791                            # Number of variant by chromosome
5792                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5793                                chrom, {}
5794                            ).get("count", 0)
5795
5796                            log.debug(
5797                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5798                            )
5799
5800                            # Annotation with regions database
5801                            if parquet_type in ["regions"]:
5802                                sql_query_annotation_from_clause = f"""
5803                                    FROM (
5804                                        SELECT 
5805                                            '{chrom}' AS \"#CHROM\",
5806                                            table_variants_from.\"POS\" AS \"POS\",
5807                                            {",".join(sql_query_annotation_to_agregate)}
5808                                        FROM {table_variants} as table_variants_from
5809                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5810                                            table_parquet_from."#CHROM" = '{chrom}'
5811                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5812                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5813                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5814                                                )
5815                                        )
5816                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5817                                        GROUP BY table_variants_from.\"POS\"
5818                                        )
5819                                        as table_parquet
5820                                """
5821
5822                                sql_query_annotation_where_clause = """
5823                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5824                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5825                                """
5826
5827                            # Annotation with variants database
5828                            else:
5829                                sql_query_annotation_from_clause = f"""
5830                                    FROM {parquet_file_link} as table_parquet
5831                                """
5832                                sql_query_annotation_where_clause = f"""
5833                                    table_variants."#CHROM" = '{chrom}'
5834                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5835                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5836                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5837                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5838                                """
5839
5840                            # Create update query
5841                            sql_query_annotation_chrom_interval_pos = f"""
5842                                UPDATE {table_variants} as table_variants
5843                                    SET INFO = 
5844                                        concat(
5845                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5846                                                THEN table_variants.INFO
5847                                                ELSE ''
5848                                            END
5849                                            ,
5850                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5851                                                        AND (
5852                                                        concat({sql_query_annotation_update_info_sets_sql})
5853                                                        )
5854                                                        NOT IN ('','.') 
5855                                                    THEN ';'
5856                                                    ELSE ''
5857                                            END
5858                                            ,
5859                                            {sql_query_annotation_update_info_sets_sql}
5860                                            )
5861                                    {sql_query_annotation_from_clause}
5862                                    WHERE {sql_query_annotation_where_clause}
5863                                    ;
5864                                """
5865
5866                            # Add update query to dict
5867                            query_dict[
5868                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5869                            ] = sql_query_annotation_chrom_interval_pos
5870
5871                        nb_of_query = len(query_dict)
5872                        num_query = 0
5873
5874                        # SET max_expression_depth TO x
5875                        self.conn.execute("SET max_expression_depth TO 10000")
5876
5877                        for query_name in query_dict:
5878                            query = query_dict[query_name]
5879                            num_query += 1
5880                            log.info(
5881                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5882                            )
5883                            result = self.conn.execute(query)
5884                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5885                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5886                            log.info(
5887                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5888                            )
5889
5890                        log.info(
5891                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5892                        )
5893
5894                    else:
5895
5896                        log.info(
5897                            f"Annotation '{annotation_name}' - No Annotations available"
5898                        )
5899
5900                    log.debug("Final header: " + str(vcf_reader.infos))
5901
5902        # Remove added columns
5903        for added_column in added_columns:
5904            self.drop_column(column=added_column)
5905
5906    def annotation_splice(self, threads: int = None) -> None:
5907        """
5908        This function annotate with snpEff
5909
5910        :param threads: The number of threads to use
5911        :return: the value of the variable "return_value".
5912        """
5913
5914        # DEBUG
5915        log.debug("Start annotation with splice tools")
5916
5917        # Threads
5918        if not threads:
5919            threads = self.get_threads()
5920        log.debug("Threads: " + str(threads))
5921
5922        # DEBUG
5923        delete_tmp = True
5924        if self.get_config().get("verbosity", "warning") in ["debug"]:
5925            delete_tmp = False
5926            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5927
5928        # Config
5929        config = self.get_config()
5930        log.debug("Config: " + str(config))
5931        splice_config = config.get("tools", {}).get("splice", {})
5932        if not splice_config:
5933            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5934        if not splice_config:
5935            msg_err = "No Splice tool config"
5936            log.error(msg_err)
5937            raise ValueError(msg_err)
5938        log.debug(f"splice_config={splice_config}")
5939
5940        # Config - Folders - Databases
5941        databases_folders = (
5942            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5943        )
5944        log.debug("Databases annotations: " + str(databases_folders))
5945
5946        # Splice docker image
5947        splice_docker_image = splice_config.get("docker").get("image")
5948
5949        # Pull splice image if it's not already there
5950        if not check_docker_image_exists(splice_docker_image):
5951            log.warning(
5952                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5953            )
5954            try:
5955                command(f"docker pull {splice_config.get('docker').get('image')}")
5956            except subprocess.CalledProcessError:
5957                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5958                log.error(msg_err)
5959                raise ValueError(msg_err)
5960                return None
5961
5962        # Config - splice databases
5963        splice_databases = (
5964            config.get("folders", {})
5965            .get("databases", {})
5966            .get("splice", DEFAULT_SPLICE_FOLDER)
5967        )
5968        splice_databases = full_path(splice_databases)
5969
5970        # Param
5971        param = self.get_param()
5972        log.debug("Param: " + str(param))
5973
5974        # Param
5975        options = param.get("annotation", {}).get("splice", {})
5976        log.debug("Options: " + str(options))
5977
5978        # Data
5979        table_variants = self.get_table_variants()
5980
5981        # Check if not empty
5982        log.debug("Check if not empty")
5983        sql_query_chromosomes = (
5984            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5985        )
5986        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5987            log.info("VCF empty")
5988            return None
5989
5990        # Export in VCF
5991        log.debug("Create initial file to annotate")
5992
5993        # Create output folder
5994        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5995        if not os.path.exists(output_folder):
5996            Path(output_folder).mkdir(parents=True, exist_ok=True)
5997
5998        # Create tmp VCF file
5999        tmp_vcf = NamedTemporaryFile(
6000            prefix=self.get_prefix(),
6001            dir=output_folder,
6002            suffix=".vcf",
6003            delete=False,
6004        )
6005        tmp_vcf_name = tmp_vcf.name
6006
6007        # VCF header
6008        header = self.get_header()
6009
6010        # Existing annotations
6011        for vcf_annotation in self.get_header().infos:
6012
6013            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6014            log.debug(
6015                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6016            )
6017
6018        # Memory limit
6019        if config.get("memory", None):
6020            memory_limit = config.get("memory", "8G").upper()
6021            # upper()
6022        else:
6023            memory_limit = "8G"
6024        log.debug(f"memory_limit: {memory_limit}")
6025
6026        # Check number of variants to annotate
6027        where_clause_regex_spliceai = r"SpliceAI_\w+"
6028        where_clause_regex_spip = r"SPiP_\w+"
6029        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6030        df_list_of_variants_to_annotate = self.get_query_to_df(
6031            query=f""" SELECT * FROM variants {where_clause} """
6032        )
6033        if len(df_list_of_variants_to_annotate) == 0:
6034            log.warning(
6035                f"No variants to annotate with splice. Variants probably already annotated with splice"
6036            )
6037            return None
6038        else:
6039            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6040
6041        # Export VCF file
6042        self.export_variant_vcf(
6043            vcf_file=tmp_vcf_name,
6044            remove_info=True,
6045            add_samples=True,
6046            index=False,
6047            where_clause=where_clause,
6048        )
6049
6050        # Create docker container and launch splice analysis
6051        if splice_config:
6052
6053            # Splice mount folders
6054            mount_folders = splice_config.get("mount", {})
6055
6056            # Genome mount
6057            mount_folders[
6058                config.get("folders", {})
6059                .get("databases", {})
6060                .get("genomes", DEFAULT_GENOME_FOLDER)
6061            ] = "ro"
6062
6063            # SpliceAI mount
6064            mount_folders[
6065                config.get("folders", {})
6066                .get("databases", {})
6067                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6068            ] = "ro"
6069
6070            # Genome mount
6071            mount_folders[
6072                config.get("folders", {})
6073                .get("databases", {})
6074                .get("spip", DEFAULT_SPIP_FOLDER)
6075            ] = "ro"
6076
6077            # Mount folders
6078            mount = []
6079
6080            # Config mount
6081            mount = [
6082                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6083                for path, mode in mount_folders.items()
6084            ]
6085
6086            if any(value for value in splice_config.values() if value is None):
6087                log.warning("At least one splice config parameter is empty")
6088                return None
6089
6090            # Params in splice nf
6091            def check_values(dico: dict):
6092                """
6093                Ensure parameters for NF splice pipeline
6094                """
6095                for key, val in dico.items():
6096                    if key == "genome":
6097                        if any(
6098                            assemb in options.get("genome", {})
6099                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6100                        ):
6101                            yield f"--{key} hg19"
6102                        elif any(
6103                            assemb in options.get("genome", {})
6104                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6105                        ):
6106                            yield f"--{key} hg38"
6107                    elif (
6108                        (isinstance(val, str) and val)
6109                        or isinstance(val, int)
6110                        or isinstance(val, bool)
6111                    ):
6112                        yield f"--{key} {val}"
6113
6114            # Genome
6115            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6116            options["genome"] = genome
6117
6118            # NF params
6119            nf_params = []
6120
6121            # Add options
6122            if options:
6123                nf_params = list(check_values(options))
6124                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6125            else:
6126                log.debug("No NF params provided")
6127
6128            # Add threads
6129            if "threads" not in options.keys():
6130                nf_params.append(f"--threads {threads}")
6131
6132            # Genome path
6133            genome_path = find_genome(
6134                config.get("folders", {})
6135                .get("databases", {})
6136                .get("genomes", DEFAULT_GENOME_FOLDER),
6137                file=f"{genome}.fa",
6138            )
6139            # Add genome path
6140            if not genome_path:
6141                raise ValueError(
6142                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6143                )
6144            else:
6145                log.debug(f"Genome: {genome_path}")
6146                nf_params.append(f"--genome_path {genome_path}")
6147
6148            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6149                """
6150                Setting up updated databases for SPiP and SpliceAI
6151                """
6152
6153                try:
6154
6155                    # SpliceAI assembly transcriptome
6156                    spliceai_assembly = os.path.join(
6157                        config.get("folders", {})
6158                        .get("databases", {})
6159                        .get("spliceai", {}),
6160                        options.get("genome"),
6161                        "transcriptome",
6162                    )
6163                    spip_assembly = options.get("genome")
6164
6165                    spip = find(
6166                        f"transcriptome_{spip_assembly}.RData",
6167                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6168                    )
6169                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6170                    log.debug(f"SPiP annotations: {spip}")
6171                    log.debug(f"SpliceAI annotations: {spliceai}")
6172                    if spip and spliceai:
6173                        return [
6174                            f"--spip_transcriptome {spip}",
6175                            f"--spliceai_annotations {spliceai}",
6176                        ]
6177                    else:
6178                        # TODO crash and go on with basic annotations ?
6179                        # raise ValueError(
6180                        #     "Can't find splice databases in configuration EXIT"
6181                        # )
6182                        log.warning(
6183                            "Can't find splice databases in configuration, use annotations file from image"
6184                        )
6185                except TypeError:
6186                    log.warning(
6187                        "Can't find splice databases in configuration, use annotations file from image"
6188                    )
6189                    return []
6190
6191            # Add options, check if transcriptome option have already beend provided
6192            if (
6193                "spip_transcriptome" not in nf_params
6194                and "spliceai_transcriptome" not in nf_params
6195            ):
6196                splice_reference = splice_annotations(options, config)
6197                if splice_reference:
6198                    nf_params.extend(splice_reference)
6199
6200            nf_params.append(f"--output_folder {output_folder}")
6201
6202            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6203            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6204            log.debug(cmd)
6205
6206            splice_config["docker"]["command"] = cmd
6207
6208            docker_cmd = get_bin_command(
6209                tool="splice",
6210                bin_type="docker",
6211                config=config,
6212                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6213                add_options=f"--name {random_uuid} {' '.join(mount)}",
6214            )
6215
6216            # Docker debug
6217            # if splice_config.get("rm_container"):
6218            #     rm_container = "--rm"
6219            # else:
6220            #     rm_container = ""
6221            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6222
6223            log.debug(docker_cmd)
6224            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6225            log.debug(res.stdout)
6226            if res.stderr:
6227                log.error(res.stderr)
6228            res.check_returncode()
6229        else:
6230            log.warning(f"Splice tool configuration not found: {config}")
6231
6232        # Update variants
6233        log.info("Annotation - Updating...")
6234        # Test find output vcf
6235        log.debug(
6236            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6237        )
6238        output_vcf = []
6239        # Wrong folder to look in
6240        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6241            if (
6242                files
6243                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6244            ):
6245                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6246        # log.debug(os.listdir(options.get("output_folder")))
6247        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6248        if not output_vcf:
6249            log.debug(
6250                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6251            )
6252        else:
6253            # Get new header from annotated vcf
6254            log.debug(f"Initial header: {len(header.infos)} fields")
6255            # Create new header with splice infos
6256            new_vcf = Variants(input=output_vcf[0])
6257            new_vcf_header = new_vcf.get_header().infos
6258            for keys, infos in new_vcf_header.items():
6259                if keys not in header.infos.keys():
6260                    header.infos[keys] = infos
6261            log.debug(f"New header: {len(header.infos)} fields")
6262            log.debug(f"Splice tmp output: {output_vcf[0]}")
6263            self.update_from_vcf(output_vcf[0])
6264
6265        # Remove folder
6266        remove_if_exists(output_folder)
6267
6268    ###
6269    # Prioritization
6270    ###
6271
6272    def get_config_default(self, name: str) -> dict:
6273        """
6274        The function `get_config_default` returns a dictionary containing default configurations for
6275        various calculations and prioritizations.
6276
6277        :param name: The `get_config_default` function returns a dictionary containing default
6278        configurations for different calculations and prioritizations. The `name` parameter is used to
6279        specify which specific configuration to retrieve from the dictionary
6280        :type name: str
6281        :return: The function `get_config_default` returns a dictionary containing default configuration
6282        settings for different calculations and prioritizations. The specific configuration settings are
6283        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6284        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6285        returned. If there is no match, an empty dictionary is returned.
6286        """
6287
6288        config_default = {
6289            "calculations": {
6290                "variant_chr_pos_alt_ref": {
6291                    "type": "sql",
6292                    "name": "variant_chr_pos_alt_ref",
6293                    "description": "Create a variant ID with chromosome, position, alt and ref",
6294                    "available": False,
6295                    "output_column_name": "variant_chr_pos_alt_ref",
6296                    "output_column_type": "String",
6297                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6298                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6299                    "operation_info": True,
6300                },
6301                "VARTYPE": {
6302                    "type": "sql",
6303                    "name": "VARTYPE",
6304                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6305                    "available": True,
6306                    "output_column_name": "VARTYPE",
6307                    "output_column_type": "String",
6308                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6309                    "operation_query": """
6310                            CASE
6311                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6312                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6313                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6314                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6315                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6316                                ELSE 'UNDEFINED'
6317                            END
6318                            """,
6319                    "info_fields": ["SVTYPE"],
6320                    "operation_info": True,
6321                },
6322                "snpeff_hgvs": {
6323                    "type": "python",
6324                    "name": "snpeff_hgvs",
6325                    "description": "HGVS nomenclatures from snpEff annotation",
6326                    "available": True,
6327                    "function_name": "calculation_extract_snpeff_hgvs",
6328                    "function_params": ["snpeff_hgvs", "ANN"],
6329                },
6330                "snpeff_ann_explode": {
6331                    "type": "python",
6332                    "name": "snpeff_ann_explode",
6333                    "description": "Explode snpEff annotations with uniquify values",
6334                    "available": True,
6335                    "function_name": "calculation_snpeff_ann_explode",
6336                    "function_params": [False, "fields", "snpeff_", "ANN"],
6337                },
6338                "snpeff_ann_explode_uniquify": {
6339                    "type": "python",
6340                    "name": "snpeff_ann_explode_uniquify",
6341                    "description": "Explode snpEff annotations",
6342                    "available": True,
6343                    "function_name": "calculation_snpeff_ann_explode",
6344                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6345                },
6346                "snpeff_ann_explode_json": {
6347                    "type": "python",
6348                    "name": "snpeff_ann_explode_json",
6349                    "description": "Explode snpEff annotations in JSON format",
6350                    "available": True,
6351                    "function_name": "calculation_snpeff_ann_explode",
6352                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6353                },
6354                "NOMEN": {
6355                    "type": "python",
6356                    "name": "NOMEN",
6357                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6358                    "available": True,
6359                    "function_name": "calculation_extract_nomen",
6360                    "function_params": [],
6361                },
6362                "FINDBYPIPELINE": {
6363                    "type": "python",
6364                    "name": "FINDBYPIPELINE",
6365                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6366                    "available": True,
6367                    "function_name": "calculation_find_by_pipeline",
6368                    "function_params": ["findbypipeline"],
6369                },
6370                "FINDBYSAMPLE": {
6371                    "type": "python",
6372                    "name": "FINDBYSAMPLE",
6373                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6374                    "available": True,
6375                    "function_name": "calculation_find_by_pipeline",
6376                    "function_params": ["findbysample"],
6377                },
6378                "GENOTYPECONCORDANCE": {
6379                    "type": "python",
6380                    "name": "GENOTYPECONCORDANCE",
6381                    "description": "Concordance of genotype for multi caller VCF",
6382                    "available": True,
6383                    "function_name": "calculation_genotype_concordance",
6384                    "function_params": [],
6385                },
6386                "BARCODE": {
6387                    "type": "python",
6388                    "name": "BARCODE",
6389                    "description": "BARCODE as VaRank tool",
6390                    "available": True,
6391                    "function_name": "calculation_barcode",
6392                    "function_params": [],
6393                },
6394                "BARCODEFAMILY": {
6395                    "type": "python",
6396                    "name": "BARCODEFAMILY",
6397                    "description": "BARCODEFAMILY as VaRank tool",
6398                    "available": True,
6399                    "function_name": "calculation_barcode_family",
6400                    "function_params": ["BCF"],
6401                },
6402                "TRIO": {
6403                    "type": "python",
6404                    "name": "TRIO",
6405                    "description": "Inheritance for a trio family",
6406                    "available": True,
6407                    "function_name": "calculation_trio",
6408                    "function_params": [],
6409                },
6410                "VAF": {
6411                    "type": "python",
6412                    "name": "VAF",
6413                    "description": "Variant Allele Frequency (VAF) harmonization",
6414                    "available": True,
6415                    "function_name": "calculation_vaf_normalization",
6416                    "function_params": [],
6417                },
6418                "VAF_stats": {
6419                    "type": "python",
6420                    "name": "VAF_stats",
6421                    "description": "Variant Allele Frequency (VAF) statistics",
6422                    "available": True,
6423                    "function_name": "calculation_genotype_stats",
6424                    "function_params": ["VAF"],
6425                },
6426                "DP_stats": {
6427                    "type": "python",
6428                    "name": "DP_stats",
6429                    "description": "Depth (DP) statistics",
6430                    "available": True,
6431                    "function_name": "calculation_genotype_stats",
6432                    "function_params": ["DP"],
6433                },
6434                "variant_id": {
6435                    "type": "python",
6436                    "name": "variant_id",
6437                    "description": "Variant ID generated from variant position and type",
6438                    "available": True,
6439                    "function_name": "calculation_variant_id",
6440                    "function_params": [],
6441                },
6442                "transcripts_json": {
6443                    "type": "python",
6444                    "name": "transcripts_json",
6445                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
6446                    "available": True,
6447                    "function_name": "calculation_transcripts_json",
6448                    "function_params": ["transcripts_json"],
6449                },
6450            },
6451            "prioritizations": {
6452                "default": {
6453                    "filter": [
6454                        {
6455                            "type": "notequals",
6456                            "value": "!PASS|\\.",
6457                            "score": 0,
6458                            "flag": "FILTERED",
6459                            "comment": ["Bad variant quality"],
6460                        },
6461                        {
6462                            "type": "equals",
6463                            "value": "REJECT",
6464                            "score": -20,
6465                            "flag": "PASS",
6466                            "comment": ["Bad variant quality"],
6467                        },
6468                    ],
6469                    "DP": [
6470                        {
6471                            "type": "gte",
6472                            "value": "50",
6473                            "score": 5,
6474                            "flag": "PASS",
6475                            "comment": ["DP higher than 50"],
6476                        }
6477                    ],
6478                    "ANN": [
6479                        {
6480                            "type": "contains",
6481                            "value": "HIGH",
6482                            "score": 5,
6483                            "flag": "PASS",
6484                            "comment": [
6485                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6486                            ],
6487                        },
6488                        {
6489                            "type": "contains",
6490                            "value": "MODERATE",
6491                            "score": 3,
6492                            "flag": "PASS",
6493                            "comment": [
6494                                "A non-disruptive variant that might change protein effectiveness"
6495                            ],
6496                        },
6497                        {
6498                            "type": "contains",
6499                            "value": "LOW",
6500                            "score": 0,
6501                            "flag": "FILTERED",
6502                            "comment": [
6503                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6504                            ],
6505                        },
6506                        {
6507                            "type": "contains",
6508                            "value": "MODIFIER",
6509                            "score": 0,
6510                            "flag": "FILTERED",
6511                            "comment": [
6512                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6513                            ],
6514                        },
6515                    ],
6516                }
6517            },
6518        }
6519
6520        return config_default.get(name, None)
6521
6522    def get_config_json(
6523        self, name: str, config_dict: dict = {}, config_file: str = None
6524    ) -> dict:
6525        """
6526        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6527        default values, a dictionary, and a file.
6528
6529        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6530        the name of the configuration. It is used to identify and retrieve the configuration settings
6531        for a specific component or module
6532        :type name: str
6533        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6534        dictionary that allows you to provide additional configuration settings or overrides. When you
6535        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6536        the key is the configuration setting you want to override or
6537        :type config_dict: dict
6538        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6539        specify the path to a configuration file that contains additional settings. If provided, the
6540        function will read the contents of this file and update the configuration dictionary with the
6541        values found in the file, overriding any existing values with the
6542        :type config_file: str
6543        :return: The function `get_config_json` returns a dictionary containing the configuration
6544        settings.
6545        """
6546
6547        # Create with default prioritizations
6548        config_default = self.get_config_default(name=name)
6549        configuration = config_default
6550        # log.debug(f"configuration={configuration}")
6551
6552        # Replace prioritizations from dict
6553        for config in config_dict:
6554            configuration[config] = config_dict[config]
6555
6556        # Replace prioritizations from file
6557        config_file = full_path(config_file)
6558        if config_file:
6559            if os.path.exists(config_file):
6560                with open(config_file) as config_file_content:
6561                    config_file_dict = json.load(config_file_content)
6562                for config in config_file_dict:
6563                    configuration[config] = config_file_dict[config]
6564            else:
6565                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6566                log.error(msg_error)
6567                raise ValueError(msg_error)
6568
6569        return configuration
6570
6571    def prioritization(self) -> None:
6572        """
6573        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6574        INFO fields
6575        """
6576
6577        # Config
6578        config = self.get_config()
6579
6580        # Param
6581        param = self.get_param()
6582
6583        # Quick Prioritizations
6584        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6585
6586        # Configuration profiles
6587        prioritization_config_file = param.get("prioritization", {}).get(
6588            "prioritization_config", None
6589        )
6590        prioritization_config_file = full_path(prioritization_config_file)
6591        prioritizations_config = self.get_config_json(
6592            name="prioritizations", config_file=prioritization_config_file
6593        )
6594
6595        # Prioritization options
6596        profiles = param.get("prioritization", {}).get("profiles", [])
6597        if isinstance(profiles, str):
6598            profiles = profiles.split(",")
6599        pzfields = param.get("prioritization", {}).get(
6600            "pzfields", ["PZFlag", "PZScore"]
6601        )
6602        if isinstance(pzfields, str):
6603            pzfields = pzfields.split(",")
6604        default_profile = param.get("prioritization", {}).get("default_profile", None)
6605        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6606        prioritization_score_mode = param.get("prioritization", {}).get(
6607            "prioritization_score_mode", "HOWARD"
6608        )
6609
6610        # Quick Prioritizations
6611        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6612        prioritizations = param.get("prioritizations", None)
6613        if prioritizations:
6614            log.info("Quick Prioritization:")
6615            for profile in prioritizations.split(","):
6616                if profile not in profiles:
6617                    profiles.append(profile)
6618                    log.info(f"   {profile}")
6619
6620        # If profile "ALL" provided, all profiles in the config profiles
6621        if "ALL" in profiles:
6622            profiles = list(prioritizations_config.keys())
6623
6624        for profile in profiles:
6625            if prioritizations_config.get(profile, None):
6626                log.debug(f"Profile '{profile}' configured")
6627            else:
6628                msg_error = f"Profile '{profile}' NOT configured"
6629                log.error(msg_error)
6630                raise ValueError(msg_error)
6631
6632        if profiles:
6633            log.info(f"Prioritization... ")
6634        else:
6635            log.debug(f"No profile defined")
6636            return
6637
6638        if not default_profile and len(profiles):
6639            default_profile = profiles[0]
6640
6641        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6642        log.debug("Profiles to check: " + str(list(profiles)))
6643
6644        # Variables
6645        table_variants = self.get_table_variants(clause="update")
6646
6647        # Added columns
6648        added_columns = []
6649
6650        # Create list of PZfields
6651        # List of PZFields
6652        list_of_pzfields_original = pzfields + [
6653            pzfield + pzfields_sep + profile
6654            for pzfield in pzfields
6655            for profile in profiles
6656        ]
6657        list_of_pzfields = []
6658        log.debug(f"{list_of_pzfields_original}")
6659
6660        # Remove existing PZfields to use if exists
6661        for pzfield in list_of_pzfields_original:
6662            if self.get_header().infos.get(pzfield, None) is None:
6663                list_of_pzfields.append(pzfield)
6664                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6665            else:
6666                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6667
6668        if list_of_pzfields:
6669
6670            # Explode Infos fields
6671            explode_infos_prefix = self.get_explode_infos_prefix()
6672            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6673            extra_infos = self.get_extra_infos()
6674
6675            # PZfields tags description
6676            PZfields_INFOS = {
6677                "PZTags": {
6678                    "ID": "PZTags",
6679                    "Number": ".",
6680                    "Type": "String",
6681                    "Description": "Variant tags based on annotation criteria",
6682                },
6683                "PZScore": {
6684                    "ID": "PZScore",
6685                    "Number": 1,
6686                    "Type": "Integer",
6687                    "Description": "Variant score based on annotation criteria",
6688                },
6689                "PZFlag": {
6690                    "ID": "PZFlag",
6691                    "Number": 1,
6692                    "Type": "String",
6693                    "Description": "Variant flag based on annotation criteria",
6694                },
6695                "PZComment": {
6696                    "ID": "PZComment",
6697                    "Number": ".",
6698                    "Type": "String",
6699                    "Description": "Variant comment based on annotation criteria",
6700                },
6701                "PZInfos": {
6702                    "ID": "PZInfos",
6703                    "Number": ".",
6704                    "Type": "String",
6705                    "Description": "Variant infos based on annotation criteria",
6706                },
6707            }
6708
6709            # Create INFO fields if not exist
6710            for field in PZfields_INFOS:
6711                field_ID = PZfields_INFOS[field]["ID"]
6712                field_description = PZfields_INFOS[field]["Description"]
6713                if field_ID not in self.get_header().infos and field_ID in pzfields:
6714                    field_description = (
6715                        PZfields_INFOS[field]["Description"]
6716                        + f", profile {default_profile}"
6717                    )
6718                    self.get_header().infos[field_ID] = vcf.parser._Info(
6719                        field_ID,
6720                        PZfields_INFOS[field]["Number"],
6721                        PZfields_INFOS[field]["Type"],
6722                        field_description,
6723                        "unknown",
6724                        "unknown",
6725                        code_type_map[PZfields_INFOS[field]["Type"]],
6726                    )
6727
6728            # Create INFO fields if not exist for each profile
6729            for profile in prioritizations_config:
6730                if profile in profiles or profiles == []:
6731                    for field in PZfields_INFOS:
6732                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6733                        field_description = (
6734                            PZfields_INFOS[field]["Description"]
6735                            + f", profile {profile}"
6736                        )
6737                        if (
6738                            field_ID not in self.get_header().infos
6739                            and field in pzfields
6740                        ):
6741                            self.get_header().infos[field_ID] = vcf.parser._Info(
6742                                field_ID,
6743                                PZfields_INFOS[field]["Number"],
6744                                PZfields_INFOS[field]["Type"],
6745                                field_description,
6746                                "unknown",
6747                                "unknown",
6748                                code_type_map[PZfields_INFOS[field]["Type"]],
6749                            )
6750
6751            # Header
6752            for pzfield in list_of_pzfields:
6753                if re.match("PZScore.*", pzfield):
6754                    added_column = self.add_column(
6755                        table_name=table_variants,
6756                        column_name=pzfield,
6757                        column_type="INTEGER",
6758                        default_value="0",
6759                    )
6760                elif re.match("PZFlag.*", pzfield):
6761                    added_column = self.add_column(
6762                        table_name=table_variants,
6763                        column_name=pzfield,
6764                        column_type="BOOLEAN",
6765                        default_value="1",
6766                    )
6767                else:
6768                    added_column = self.add_column(
6769                        table_name=table_variants,
6770                        column_name=pzfield,
6771                        column_type="STRING",
6772                        default_value="''",
6773                    )
6774                added_columns.append(added_column)
6775
6776            # Profiles
6777            if profiles:
6778
6779                # foreach profile in configuration file
6780                for profile in prioritizations_config:
6781
6782                    # If profile is asked in param, or ALL are asked (empty profile [])
6783                    if profile in profiles or profiles == []:
6784                        log.info(f"Profile '{profile}'")
6785
6786                        sql_set_info_option = ""
6787
6788                        sql_set_info = []
6789
6790                        # PZ fields set
6791
6792                        # PZScore
6793                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6794                            sql_set_info.append(
6795                                f"""
6796                                    concat(
6797                                        'PZScore{pzfields_sep}{profile}=',
6798                                        PZScore{pzfields_sep}{profile}
6799                                    ) 
6800                                """
6801                            )
6802                            if (
6803                                profile == default_profile
6804                                and "PZScore" in list_of_pzfields
6805                            ):
6806                                sql_set_info.append(
6807                                    f"""
6808                                        concat(
6809                                            'PZScore=',
6810                                            PZScore{pzfields_sep}{profile}
6811                                        )
6812                                    """
6813                                )
6814
6815                        # PZFlag
6816                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6817                            sql_set_info.append(
6818                                f"""
6819                                    concat(
6820                                        'PZFlag{pzfields_sep}{profile}=',
6821                                        CASE 
6822                                            WHEN PZFlag{pzfields_sep}{profile}==1
6823                                            THEN 'PASS'
6824                                            WHEN PZFlag{pzfields_sep}{profile}==0
6825                                            THEN 'FILTERED'
6826                                        END
6827                                    ) 
6828                                """
6829                            )
6830                            if (
6831                                profile == default_profile
6832                                and "PZFlag" in list_of_pzfields
6833                            ):
6834                                sql_set_info.append(
6835                                    f"""
6836                                        concat(
6837                                            'PZFlag=',
6838                                            CASE 
6839                                                WHEN PZFlag{pzfields_sep}{profile}==1
6840                                                THEN 'PASS'
6841                                                WHEN PZFlag{pzfields_sep}{profile}==0
6842                                                THEN 'FILTERED'
6843                                            END
6844                                        )
6845                                    """
6846                                )
6847
6848                        # PZComment
6849                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6850                            sql_set_info.append(
6851                                f"""
6852                                    CASE
6853                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6854                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6855                                        ELSE ''
6856                                    END
6857                                """
6858                            )
6859                            if (
6860                                profile == default_profile
6861                                and "PZComment" in list_of_pzfields
6862                            ):
6863                                sql_set_info.append(
6864                                    f"""
6865                                        CASE
6866                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6867                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6868                                            ELSE ''
6869                                        END
6870                                    """
6871                                )
6872
6873                        # PZInfos
6874                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6875                            sql_set_info.append(
6876                                f"""
6877                                    CASE
6878                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6879                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6880                                        ELSE ''
6881                                    END
6882                                """
6883                            )
6884                            if (
6885                                profile == default_profile
6886                                and "PZInfos" in list_of_pzfields
6887                            ):
6888                                sql_set_info.append(
6889                                    f"""
6890                                        CASE
6891                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6892                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6893                                            ELSE ''
6894                                        END
6895                                    """
6896                                )
6897
6898                        # Merge PZfields
6899                        sql_set_info_option = ""
6900                        sql_set_sep = ""
6901                        for sql_set in sql_set_info:
6902                            if sql_set_sep:
6903                                sql_set_info_option += f"""
6904                                    , concat('{sql_set_sep}', {sql_set})
6905                                """
6906                            else:
6907                                sql_set_info_option += f"""
6908                                    , {sql_set}
6909                                """
6910                            sql_set_sep = ";"
6911
6912                        sql_queries = []
6913                        for annotation in prioritizations_config[profile]:
6914
6915                            # Check if annotation field is present
6916                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6917                                log.debug(f"Annotation '{annotation}' not in data")
6918                                continue
6919                            else:
6920                                log.debug(f"Annotation '{annotation}' in data")
6921
6922                            # For each criterions
6923                            for criterion in prioritizations_config[profile][
6924                                annotation
6925                            ]:
6926                                criterion_type = criterion["type"]
6927                                criterion_value = criterion["value"]
6928                                criterion_score = criterion.get("score", 0)
6929                                criterion_flag = criterion.get("flag", "PASS")
6930                                criterion_flag_bool = criterion_flag == "PASS"
6931                                criterion_comment = (
6932                                    ", ".join(criterion.get("comment", []))
6933                                    .replace("'", "''")
6934                                    .replace(";", ",")
6935                                    .replace("\t", " ")
6936                                )
6937                                criterion_infos = (
6938                                    str(criterion)
6939                                    .replace("'", "''")
6940                                    .replace(";", ",")
6941                                    .replace("\t", " ")
6942                                )
6943
6944                                sql_set = []
6945                                sql_set_info = []
6946
6947                                # PZ fields set
6948                                if (
6949                                    f"PZScore{pzfields_sep}{profile}"
6950                                    in list_of_pzfields
6951                                ):
6952                                    if prioritization_score_mode == "HOWARD":
6953                                        sql_set.append(
6954                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6955                                        )
6956                                    elif prioritization_score_mode == "VaRank":
6957                                        sql_set.append(
6958                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6959                                        )
6960                                    else:
6961                                        sql_set.append(
6962                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6963                                        )
6964                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6965                                    sql_set.append(
6966                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6967                                    )
6968                                if (
6969                                    f"PZComment{pzfields_sep}{profile}"
6970                                    in list_of_pzfields
6971                                ):
6972                                    sql_set.append(
6973                                        f"""
6974                                            PZComment{pzfields_sep}{profile} = 
6975                                                concat(
6976                                                    PZComment{pzfields_sep}{profile},
6977                                                    CASE 
6978                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6979                                                        THEN ', '
6980                                                        ELSE ''
6981                                                    END,
6982                                                    '{criterion_comment}'
6983                                                )
6984                                        """
6985                                    )
6986                                if (
6987                                    f"PZInfos{pzfields_sep}{profile}"
6988                                    in list_of_pzfields
6989                                ):
6990                                    sql_set.append(
6991                                        f"""
6992                                            PZInfos{pzfields_sep}{profile} = 
6993                                                concat(
6994                                                    PZInfos{pzfields_sep}{profile},
6995                                                    '{criterion_infos}'
6996                                                )
6997                                        """
6998                                    )
6999                                sql_set_option = ",".join(sql_set)
7000
7001                                # Criterion and comparison
7002                                try:
7003                                    float(criterion_value)
7004                                    sql_update = f"""
7005                                        UPDATE {table_variants}
7006                                        SET {sql_set_option}
7007                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7008                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7009                                        """
7010                                except:
7011                                    contains_option = ""
7012                                    if criterion_type == "contains":
7013                                        contains_option = ".*"
7014                                    sql_update = f"""
7015                                        UPDATE {table_variants}
7016                                        SET {sql_set_option}
7017                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7018                                        """
7019                                sql_queries.append(sql_update)
7020
7021                        # PZTags
7022                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7023
7024                            # Create PZFalgs value
7025                            pztags_value = ""
7026                            pztags_sep_default = "|"
7027                            pztags_sep = ""
7028                            for pzfield in pzfields:
7029                                if pzfield not in ["PZTags"]:
7030                                    if (
7031                                        f"{pzfield}{pzfields_sep}{profile}"
7032                                        in list_of_pzfields
7033                                    ):
7034                                        if pzfield in ["PZFlag"]:
7035                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7036                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7037                                                    THEN 'PASS'
7038                                                    ELSE 'FILTERED'
7039                                                END, '"""
7040                                        else:
7041                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7042                                        pztags_sep = pztags_sep_default
7043
7044                            # Add Query update for PZFlags
7045                            sql_update_pztags = f"""
7046                                UPDATE {table_variants}
7047                                SET INFO = concat(
7048                                        INFO,
7049                                        CASE WHEN INFO NOT in ('','.')
7050                                                THEN ';'
7051                                                ELSE ''
7052                                        END,
7053                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7054                                    )
7055                                """
7056                            sql_queries.append(sql_update_pztags)
7057
7058                            # Add Query update for PZFlags for default
7059                            if profile == default_profile:
7060                                sql_update_pztags_default = f"""
7061                                UPDATE {table_variants}
7062                                SET INFO = concat(
7063                                        INFO,
7064                                        ';',
7065                                        'PZTags={pztags_value}'
7066                                    )
7067                                """
7068                                sql_queries.append(sql_update_pztags_default)
7069
7070                        log.info(f"""Profile '{profile}' - Prioritization... """)
7071
7072                        if sql_queries:
7073
7074                            for sql_query in sql_queries:
7075                                log.debug(
7076                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7077                                )
7078                                self.conn.execute(sql_query)
7079
7080                        log.info(f"""Profile '{profile}' - Update... """)
7081                        sql_query_update = f"""
7082                            UPDATE {table_variants}
7083                            SET INFO =  
7084                                concat(
7085                                    CASE
7086                                        WHEN INFO NOT IN ('','.')
7087                                        THEN concat(INFO, ';')
7088                                        ELSE ''
7089                                    END
7090                                    {sql_set_info_option}
7091                                )
7092                        """
7093                        self.conn.execute(sql_query_update)
7094
7095        else:
7096
7097            log.warning(f"No profiles in parameters")
7098
7099        # Remove added columns
7100        for added_column in added_columns:
7101            self.drop_column(column=added_column)
7102
7103        # Explode INFOS fields into table fields
7104        if self.get_explode_infos():
7105            self.explode_infos(
7106                prefix=self.get_explode_infos_prefix(),
7107                fields=self.get_explode_infos_fields(),
7108                force=True,
7109            )
7110
7111        return
7112
7113    ###
7114    # HGVS
7115    ###
7116
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline (operating on the variants table of the current connexion):
        parse quick "hgvs_options" and the "hgvs" parameter section (return
        early if HGVS annotation is not enabled), locate the genome FASTA and
        refSeq/refSeqLink database files, load variant-overlapping transcripts
        into Polars dataframes and a transcripts model, compute HGVS names in
        parallel with Dask (one partition per thread), then write the result
        back into a temporary column, append it to the INFO field, and register
        the "hgvs" INFO field in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a comma-separated list of HGVS names associated with the given genomic
            coordinates and alleles.

            Closure note: this function reads `polars_conn`, `transcripts`,
            `genome` and the HGVS option flags (`use_exon`, `use_gene`, ...)
            from the enclosing scope; they must all be bound before the Dask
            apply runs.

            :param row: A dictionary-like object that contains the values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of transcripts overlapping this variant position
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model for this transcript name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number (only computed if requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession for the transcript, if protein output is needed
                # NOTE(review): when no refSeqLink file was found,
                # `refseqlink_df` is never defined and this query would fail —
                # confirm refseqlink is guaranteed when protein options are set
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name with the configured options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create comma-separated list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion (rebound further below, once the dataframes exist)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome folder (no default; used for direct genome lookup first)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" ("opt1=val1,opt2,...") into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # A bare option name means "enabled"
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink overrides from the hgvs parameter section
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: try the explicit genome path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): the random suffix lowers, but does not eliminate, the
        # chance of colliding with an existing column name
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading refSeq transcripts overlapping the variants into a Polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink transcript/protein pairs
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by name in annotation_hgvs_partition)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion, re-created so refseq_df/refseqlink_df are registered
        # (the closure above reads this rebound object)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, matching on the variant key
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' for each annotated variant
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add the 'hgvs' INFO field to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7499
7500    ###
7501    # Calculation
7502    ###
7503
7504    def get_operations_help(
7505        self, operations_config_dict: dict = {}, operations_config_file: str = None
7506    ) -> list:
7507
7508        # Init
7509        operations_help = []
7510
7511        # operations
7512        operations = self.get_config_json(
7513            name="calculations",
7514            config_dict=operations_config_dict,
7515            config_file=operations_config_file,
7516        )
7517        for op in operations:
7518            op_name = operations[op].get("name", op).upper()
7519            op_description = operations[op].get("description", op_name)
7520            op_available = operations[op].get("available", False)
7521            if op_available:
7522                operations_help.append(f"   {op_name}: {op_description}")
7523
7524        # Sort operations
7525        operations_help.sort()
7526
7527        # insert header
7528        operations_help.insert(0, "Available calculation operations:")
7529
7530        # Return
7531        return operations_help
7532
7533    def calculation(
7534        self,
7535        operations: dict = {},
7536        operations_config_dict: dict = {},
7537        operations_config_file: str = None,
7538    ) -> None:
7539        """
7540        It takes a list of operations, and for each operation, it checks if it's a python or sql
7541        operation, and then calls the appropriate function
7542
7543        param json example:
7544            "calculation": {
7545                "NOMEN": {
7546                    "options": {
7547                        "hgvs_field": "hgvs"
7548                    },
7549                "middle" : null
7550            }
7551        """
7552
7553        # Param
7554        param = self.get_param()
7555
7556        # operations config
7557        operations_config = self.get_config_json(
7558            name="calculations",
7559            config_dict=operations_config_dict,
7560            config_file=operations_config_file,
7561        )
7562
7563        # Upper keys
7564        operations_config = {k.upper(): v for k, v in operations_config.items()}
7565
7566        # Calculations
7567
7568        # Operations from param
7569        operations = param.get("calculation", {}).get("calculations", operations)
7570
7571        # Quick calculation - add
7572        if param.get("calculations", None):
7573            calculations_list = [
7574                value for value in param.get("calculations", "").split(",")
7575            ]
7576            log.info(f"Quick Calculations:")
7577            for calculation_key in calculations_list:
7578                log.info(f"   {calculation_key}")
7579            for calculation_operation in calculations_list:
7580                if calculation_operation.upper() not in operations:
7581                    operations[calculation_operation.upper()] = {}
7582                    add_value_into_dict(
7583                        dict_tree=param,
7584                        sections=[
7585                            "calculation",
7586                            "calculations",
7587                            calculation_operation.upper(),
7588                        ],
7589                        value={},
7590                    )
7591
7592        # Operations for calculation
7593        if not operations:
7594            operations = param.get("calculation", {}).get("calculations", {})
7595
7596        if operations:
7597            log.info(f"Calculations...")
7598
7599        # For each operations
7600        for operation_name in operations:
7601            operation_name = operation_name.upper()
7602            if operation_name not in [""]:
7603                if operation_name in operations_config:
7604                    log.info(f"Calculation '{operation_name}'")
7605                    operation = operations_config[operation_name]
7606                    operation_type = operation.get("type", "sql")
7607                    if operation_type == "python":
7608                        self.calculation_process_function(
7609                            operation=operation, operation_name=operation_name
7610                        )
7611                    elif operation_type == "sql":
7612                        self.calculation_process_sql(
7613                            operation=operation, operation_name=operation_name
7614                        )
7615                    else:
7616                        log.error(
7617                            f"Operations config: Type '{operation_type}' NOT available"
7618                        )
7619                        raise ValueError(
7620                            f"Operations config: Type '{operation_type}' NOT available"
7621                        )
7622                else:
7623                    log.error(
7624                        f"Operations config: Calculation '{operation_name}' NOT available"
7625                    )
7626                    raise ValueError(
7627                        f"Operations config: Calculation '{operation_name}' NOT available"
7628                    )
7629
7630        # Explode INFOS fields into table fields
7631        if self.get_explode_infos():
7632            self.explode_infos(
7633                prefix=self.get_explode_infos_prefix(),
7634                fields=self.get_explode_infos_fields(),
7635                force=True,
7636            )
7637
7638    def calculation_process_sql(
7639        self, operation: dict, operation_name: str = "unknown"
7640    ) -> None:
7641        """
7642        The `calculation_process_sql` function takes in a mathematical operation as a string and
7643        performs the operation, updating the specified table with the result.
7644
7645        :param operation: The `operation` parameter is a dictionary that contains information about the
7646        mathematical operation to be performed. It includes the following keys:
7647        :type operation: dict
7648        :param operation_name: The `operation_name` parameter is a string that represents the name of
7649        the mathematical operation being performed. It is used for logging and error handling purposes,
7650        defaults to unknown
7651        :type operation_name: str (optional)
7652        """
7653
7654        # table variants
7655        table_variants = self.get_table_variants(clause="alter")
7656
7657        # Operation infos
7658        operation_name = operation.get("name", "unknown")
7659        log.debug(f"process sql {operation_name}")
7660        output_column_name = operation.get("output_column_name", operation_name)
7661        output_column_type = operation.get("output_column_type", "String")
7662        prefix = operation.get("explode_infos_prefix", "")
7663        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7664        output_column_description = operation.get(
7665            "output_column_description", f"{operation_name} operation"
7666        )
7667        operation_query = operation.get("operation_query", None)
7668        if isinstance(operation_query, list):
7669            operation_query = " ".join(operation_query)
7670        operation_info_fields = operation.get("info_fields", [])
7671        operation_info_fields_check = operation.get("info_fields_check", False)
7672        operation_info = operation.get("operation_info", True)
7673
7674        if operation_query:
7675
7676            # Info fields check
7677            operation_info_fields_check_result = True
7678            if operation_info_fields_check:
7679                header_infos = self.get_header().infos
7680                for info_field in operation_info_fields:
7681                    operation_info_fields_check_result = (
7682                        operation_info_fields_check_result
7683                        and info_field in header_infos
7684                    )
7685
7686            # If info fields available
7687            if operation_info_fields_check_result:
7688
7689                # Added_columns
7690                added_columns = []
7691
7692                # Create VCF header field
7693                vcf_reader = self.get_header()
7694                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7695                    output_column_name,
7696                    ".",
7697                    output_column_type,
7698                    output_column_description,
7699                    "howard calculation",
7700                    "0",
7701                    self.code_type_map.get(output_column_type),
7702                )
7703
7704                # Explode infos if needed
7705                log.debug(f"calculation_process_sql prefix {prefix}")
7706                added_columns += self.explode_infos(
7707                    prefix=prefix,
7708                    fields=[output_column_name] + operation_info_fields,
7709                    force=True,
7710                )
7711
7712                # Create column
7713                added_column = self.add_column(
7714                    table_name=table_variants,
7715                    column_name=prefix + output_column_name,
7716                    column_type=output_column_type_sql,
7717                    default_value="null",
7718                )
7719                added_columns.append(added_column)
7720
7721                # Operation calculation
7722                try:
7723
7724                    # Query to update calculation column
7725                    sql_update = f"""
7726                        UPDATE {table_variants}
7727                        SET "{prefix}{output_column_name}" = ({operation_query})
7728                    """
7729                    self.conn.execute(sql_update)
7730
7731                    # Add to INFO
7732                    if operation_info:
7733                        sql_update_info = f"""
7734                            UPDATE {table_variants}
7735                            SET "INFO" =
7736                                concat(
7737                                    CASE
7738                                        WHEN "INFO" IS NOT NULL
7739                                        THEN concat("INFO", ';')
7740                                        ELSE ''
7741                                    END,
7742                                    '{output_column_name}=',
7743                                    "{prefix}{output_column_name}"
7744                                )
7745                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7746                        """
7747                        self.conn.execute(sql_update_info)
7748
7749                except:
7750                    log.error(
7751                        f"Operations config: Calculation '{operation_name}' query failed"
7752                    )
7753                    raise ValueError(
7754                        f"Operations config: Calculation '{operation_name}' query failed"
7755                    )
7756
7757                # Remove added columns
7758                for added_column in added_columns:
7759                    log.debug(f"added_column: {added_column}")
7760                    self.drop_column(column=added_column)
7761
7762            else:
7763                log.error(
7764                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7765                )
7766                raise ValueError(
7767                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7768                )
7769
7770        else:
7771            log.error(
7772                f"Operations config: Calculation '{operation_name}' query NOT defined"
7773            )
7774            raise ValueError(
7775                f"Operations config: Calculation '{operation_name}' query NOT defined"
7776            )
7777
7778    def calculation_process_function(
7779        self, operation: dict, operation_name: str = "unknown"
7780    ) -> None:
7781        """
7782        The `calculation_process_function` takes in an operation dictionary and performs the specified
7783        function with the given parameters.
7784
7785        :param operation: The `operation` parameter is a dictionary that contains information about the
7786        operation to be performed. It has the following keys:
7787        :type operation: dict
7788        :param operation_name: The `operation_name` parameter is a string that represents the name of
7789        the operation being performed. It is used for logging purposes, defaults to unknown
7790        :type operation_name: str (optional)
7791        """
7792
7793        operation_name = operation["name"]
7794        log.debug(f"process sql {operation_name}")
7795        function_name = operation["function_name"]
7796        function_params = operation["function_params"]
7797        getattr(self, function_name)(*function_params)
7798
7799    def calculation_variant_id(self) -> None:
7800        """
7801        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7802        updates the INFO field of a variants table with the variant ID.
7803        """
7804
7805        # variant_id annotation field
7806        variant_id_tag = self.get_variant_id_column()
7807        added_columns = [variant_id_tag]
7808
7809        # variant_id hgvs tags"
7810        vcf_infos_tags = {
7811            variant_id_tag: "howard variant ID annotation",
7812        }
7813
7814        # Variants table
7815        table_variants = self.get_table_variants()
7816
7817        # Header
7818        vcf_reader = self.get_header()
7819
7820        # Add variant_id to header
7821        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7822            variant_id_tag,
7823            ".",
7824            "String",
7825            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7826            "howard calculation",
7827            "0",
7828            self.code_type_map.get("String"),
7829        )
7830
7831        # Update
7832        sql_update = f"""
7833            UPDATE {table_variants}
7834            SET "INFO" = 
7835                concat(
7836                    CASE
7837                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7838                        THEN ''
7839                        ELSE concat("INFO", ';')
7840                    END,
7841                    '{variant_id_tag}=',
7842                    "{variant_id_tag}"
7843                )
7844        """
7845        self.conn.execute(sql_update)
7846
7847        # Remove added columns
7848        for added_column in added_columns:
7849            self.drop_column(column=added_column)
7850
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field of a VCF
        and append them to the INFO column of the variants table.

        The snpEff sub-field layout is recovered from the quoted section of the
        ANN header description; each variant's ANN value is then reduced to an
        HGVS string with `extract_snpeff_hgvs` and written back to INFO as a
        '<snpeff_hgvs>=' entry. If the snpEff field is absent from the header,
        a warning is logged and the table is left unchanged.

        :param snpeff_hgvs: name of the INFO field that will store the HGVS
            nomenclatures extracted from the snpEff annotation field, defaults
            to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field holding the snpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # Snpeff hgvs tags (header description for the output field)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overridden to
        # "INFO/" — confirm this is intended rather than keeping the prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (names of the exploded columns)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added temporarily, dropped at the end of the method
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff lists its sub-fields inside single
            # quotes in the description, e.g. "... 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized (alphanumeric-only) sub-field name mapped to
                    # its original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (temporary, used as join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of (variant id, ANN value)
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column: reduce each ANN value to an HGVS string
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column.
            # The dataframe is visible to the SQL engine by its variable name
            # (presumably a DuckDB replacement scan — verify).
            # NOTE(review): the UPDATE targets the hardcoded 'variants' table
            # while the WHERE clause uses {table_variants} — confirm
            # get_table_variants() always returns 'variants'
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7987
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations into INFO entries of the variants table.

        The snpEff sub-field layout is recovered from the quoted section of the
        ANN header description; each variant's ANN value is expanded with
        `explode_snpeff_ann`, either into one prefixed INFO field per
        annotation ("fields" format) or into a single JSON INFO field. If the
        snpEff field is absent from the header, a warning is logged and the
        table is left unchanged.

        :param uniquify: whether duplicate annotation values should be removed
            from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output format of the exploded annotations,
            either "fields" (one INFO field per snpEff sub-field) or "JSON"
            (a single JSON INFO field), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO field names
            (or used as the JSON field name in "JSON" format), defaults to
            snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field holding the snpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # SnpEff annotation field (internal name of the exploded column)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags (header description for the output fields)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overridden to
        # "INFO/" — confirm this is intended rather than keeping the prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (names of the exploded columns)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added temporarily, dropped at the end of the method
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff lists its sub-fields inside single
            # quotes in the description, e.g. "... 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized (alphanumeric-only) sub-field name mapped to
                    # its original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (temporary, used as join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of (variant id, ANN value)
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns: expand each ANN value into the requested
            # output format (fields or JSON)
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: JSON format declares a single field named after the
            # prefix; "fields" format declares one field per snpEff sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # In JSON format the exploded value is a bare JSON string, so
                # the '<prefix>=' key is prepended in the SQL update below
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update INFO from the dataframe, joined on the variant id column.
            # The dataframe is visible to the SQL engine by its variable name
            # (presumably a DuckDB replacement scan — verify).
            # NOTE(review): the UPDATE targets the hardcoded 'variants' table
            # while the WHERE clause uses {table_variants} — confirm
            # get_table_variants() always returns 'variants'
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8162
    def calculation_extract_nomen(self) -> None:
        """
        Extract NOMEN hgvs nomenclatures from an HGVS annotation field and
        append them to the INFO column of the variants table.

        The HGVS field (configured under
        param["calculation"]["calculations"]["NOMEN"]["options"]) is exploded
        into a column, each value is parsed with `find_nomen` (optionally
        constrained by a transcripts-of-preference file), and the resulting
        NOMEN components (NOMEN, CNOMEN, PNOMEN, ...) are written back to INFO.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # NOMEN field: dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output INFO field names and their header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field name from param (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts-of-preference file from param (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added temporarily, dropped at the end of the method
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: available exploded columns
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe of coordinates plus the exploded HGVS column
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column: parse each HGVS value into a dict of
            # NOMEN components
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (the lambda is applied immediately each iteration, so the
                # late-binding of nomen_field is not an issue here)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each component contributes a ';<FIELD>=<value>' fragment
                # (empty string when the component is NULL or '')
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update: fragments become extra concat() arguments
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO from the dataframe, joined on variant coordinates.
            # The dataframe is visible to the SQL engine by its variable name
            # (presumably a DuckDB replacement scan — verify).
            # NOTE(review): unlike sibling methods, '' and '.' INFO values are
            # not normalized here, so an empty-string INFO would end up with a
            # leading ';' — confirm whether that case can occur
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8305
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute the number of pipelines/samples carrying each variant and
        append the result to the INFO column of the variants table.

        Requires a FORMAT column and at least one sample in the header; does
        nothing otherwise. Each row's FORMAT and sample columns are passed to
        `findbypipeline`, and the result is written back to INFO as a
        '<tag>=' entry.

        :param tag: name of the INFO field receiving the findbypipeline
            result (also used in the VCF header), defaults to findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples (otherwise there is nothing to compute)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags (header description for the output field)
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field: dataframe column holding the computed result
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (temporary, used as join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples: columns needed per row
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of the per-row genotype data
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column: one result per row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add the findbypipeline field to the header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column.
            # The dataframe is visible to the SQL engine by its variable name
            # (presumably a DuckDB replacement scan — verify).
            # NOTE(review): the statement hardcodes the 'variants' table while
            # the SELECT above uses {table_variants} — confirm
            # get_table_variants() always returns 'variants'
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()
8411
8412    def calculation_genotype_concordance(self) -> None:
8413        """
8414        The function `calculation_genotype_concordance` calculates the genotype concordance for
8415        multi-caller VCF files and updates the variant information in the database.
8416        """
8417
8418        # if FORMAT and samples
8419        if (
8420            "FORMAT" in self.get_header_columns_as_list()
8421            and self.get_header_sample_list()
8422        ):
8423
8424            # genotypeconcordance annotation field
8425            genotypeconcordance_tag = "genotypeconcordance"
8426
8427            # VCF infos tags
8428            vcf_infos_tags = {
8429                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8430            }
8431
8432            # Prefix
8433            prefix = self.get_explode_infos_prefix()
8434
8435            # Field
8436            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8437
8438            # Variants table
8439            table_variants = self.get_table_variants()
8440
8441            # Header
8442            vcf_reader = self.get_header()
8443
8444            # Create variant id
8445            variant_id_column = self.get_variant_id_column()
8446            added_columns = [variant_id_column]
8447
8448            # variant_id, FORMAT and samples
8449            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8450                self.get_header_sample_list()
8451            )
8452
8453            # Create dataframe
8454            dataframe_genotypeconcordance = self.get_query_to_df(
8455                f""" SELECT {samples_fields} FROM {table_variants} """
8456            )
8457
8458            # Create genotypeconcordance column
8459            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8460                dataframe_genotypeconcordance.apply(
8461                    lambda row: genotypeconcordance(
8462                        row, samples=self.get_header_sample_list()
8463                    ),
8464                    axis=1,
8465                )
8466            )
8467
8468            # Add genotypeconcordance to header
8469            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8470                genotypeconcordance_tag,
8471                ".",
8472                "String",
8473                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8474                "howard calculation",
8475                "0",
8476                self.code_type_map.get("String"),
8477            )
8478
8479            # Update
8480            sql_update = f"""
8481                UPDATE variants
8482                SET "INFO" = 
8483                    concat(
8484                        CASE
8485                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8486                            THEN ''
8487                            ELSE concat("INFO", ';')
8488                        END,
8489                        CASE
8490                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8491                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8492                            THEN concat(
8493                                    '{genotypeconcordance_tag}=',
8494                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8495                                )
8496                            ELSE ''
8497                        END
8498                    )
8499                FROM dataframe_genotypeconcordance
8500                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8501            """
8502            self.conn.execute(sql_update)
8503
8504            # Remove added columns
8505            for added_column in added_columns:
8506                self.drop_column(column=added_column)
8507
8508            # Delete dataframe
8509            del dataframe_genotypeconcordance
8510            gc.collect()
8511
8512    def calculation_barcode(self, tag: str = "barcode") -> None:
8513        """
8514        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8515        updates the INFO field in the file with the calculated barcode values.
8516
8517        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8518        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8519        the default tag name is set to "barcode", defaults to barcode
8520        :type tag: str (optional)
8521        """
8522
8523        # if FORMAT and samples
8524        if (
8525            "FORMAT" in self.get_header_columns_as_list()
8526            and self.get_header_sample_list()
8527        ):
8528
8529            # barcode annotation field
8530            if not tag:
8531                tag = "barcode"
8532
8533            # VCF infos tags
8534            vcf_infos_tags = {
8535                tag: "barcode calculation (VaRank)",
8536            }
8537
8538            # Prefix
8539            prefix = self.get_explode_infos_prefix()
8540
8541            # Field
8542            barcode_infos = prefix + tag
8543
8544            # Variants table
8545            table_variants = self.get_table_variants()
8546
8547            # Header
8548            vcf_reader = self.get_header()
8549
8550            # Create variant id
8551            variant_id_column = self.get_variant_id_column()
8552            added_columns = [variant_id_column]
8553
8554            # variant_id, FORMAT and samples
8555            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8556                self.get_header_sample_list()
8557            )
8558
8559            # Create dataframe
8560            dataframe_barcode = self.get_query_to_df(
8561                f""" SELECT {samples_fields} FROM {table_variants} """
8562            )
8563
8564            # Create barcode column
8565            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8566                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8567            )
8568
8569            # Add barcode to header
8570            vcf_reader.infos[tag] = vcf.parser._Info(
8571                tag,
8572                ".",
8573                "String",
8574                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8575                "howard calculation",
8576                "0",
8577                self.code_type_map.get("String"),
8578            )
8579
8580            # Update
8581            sql_update = f"""
8582                UPDATE {table_variants}
8583                SET "INFO" = 
8584                    concat(
8585                        CASE
8586                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8587                            THEN ''
8588                            ELSE concat("INFO", ';')
8589                        END,
8590                        CASE
8591                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8592                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8593                            THEN concat(
8594                                    '{tag}=',
8595                                    dataframe_barcode."{barcode_infos}"
8596                                )
8597                            ELSE ''
8598                        END
8599                    )
8600                FROM dataframe_barcode
8601                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8602            """
8603            self.conn.execute(sql_update)
8604
8605            # Remove added columns
8606            for added_column in added_columns:
8607                self.drop_column(column=added_column)
8608
8609            # Delete dataframe
8610            del dataframe_barcode
8611            gc.collect()
8612
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants in a VCF
        file and appends it to the FORMAT and sample columns (not the INFO field).

        Family members are taken from the 'BARCODEFAMILY.family_pedigree'
        calculation parameter, which may be a JSON file path, a JSON string, a
        comma-separated sample list, or a dict; when absent, all samples of the
        VCF are used. Two genotype fields are appended to every sample:
        '<tag>' (the family barcode) and '<tag>S' (the samples used).

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # Genotypes are required: FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if an empty tag is given)
            if not tag:
                tag = "BCF"

            # VCF infos tags: '<tag>' for the barcode, '<tag>S' for the sample list
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree describing the family members
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (JSON mapping member -> sample name)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: build an identity mapping sample -> sample
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field name holding the computed barcode in the dataframe
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to the variants table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe restricted to the family genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise computation on family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family fields to header (FORMAT declarations)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: family samples receive the barcode
            # value and the family sample list, FORMAT receives the field names,
            # any other sample receives '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.'), pad with '.' for each existing
                # FORMAT field before appending the two new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_barcode
            gc.collect()
8802
8803    def calculation_trio(self) -> None:
8804        """
8805        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8806        information to the INFO field of each variant.
8807        """
8808
8809        # if FORMAT and samples
8810        if (
8811            "FORMAT" in self.get_header_columns_as_list()
8812            and self.get_header_sample_list()
8813        ):
8814
8815            # trio annotation field
8816            trio_tag = "trio"
8817
8818            # VCF infos tags
8819            vcf_infos_tags = {
8820                "trio": "trio calculation",
8821            }
8822
8823            # Param
8824            param = self.get_param()
8825
8826            # Prefix
8827            prefix = self.get_explode_infos_prefix()
8828
8829            # Trio param
8830            trio_ped = (
8831                param.get("calculation", {})
8832                .get("calculations", {})
8833                .get("TRIO", {})
8834                .get("trio_pedigree", None)
8835            )
8836
8837            # Load trio
8838            if trio_ped:
8839
8840                # Trio pedigree is a file
8841                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8842                    log.debug("TRIO pedigree is file")
8843                    with open(full_path(trio_ped)) as trio_ped:
8844                        trio_ped = json.load(trio_ped)
8845
8846                # Trio pedigree is a string
8847                elif isinstance(trio_ped, str):
8848                    log.debug("TRIO pedigree is str")
8849                    try:
8850                        trio_ped = json.loads(trio_ped)
8851                        log.debug("TRIO pedigree is json str")
8852                    except ValueError as e:
8853                        trio_samples = trio_ped.split(",")
8854                        if len(trio_samples) == 3:
8855                            trio_ped = {
8856                                "father": trio_samples[0],
8857                                "mother": trio_samples[1],
8858                                "child": trio_samples[2],
8859                            }
8860                            log.debug("TRIO pedigree is list str")
8861                        else:
8862                            msg_error = "TRIO pedigree not well formatted"
8863                            log.error(msg_error)
8864                            raise ValueError(msg_error)
8865
8866                # Trio pedigree is a dict
8867                elif isinstance(trio_ped, dict):
8868                    log.debug("TRIO pedigree is dict")
8869
8870                # Trio pedigree is not well formatted
8871                else:
8872                    msg_error = "TRIO pedigree not well formatted"
8873                    log.error(msg_error)
8874                    raise ValueError(msg_error)
8875
8876                # Construct trio list
8877                trio_samples = [
8878                    trio_ped.get("father", ""),
8879                    trio_ped.get("mother", ""),
8880                    trio_ped.get("child", ""),
8881                ]
8882
8883            else:
8884                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8885                samples_list = self.get_header_sample_list()
8886                if len(samples_list) >= 3:
8887                    trio_samples = self.get_header_sample_list()[0:3]
8888                    trio_ped = {
8889                        "father": trio_samples[0],
8890                        "mother": trio_samples[1],
8891                        "child": trio_samples[2],
8892                    }
8893                else:
8894                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8895                    log.error(msg_error)
8896                    raise ValueError(msg_error)
8897
8898            # Check trio pedigree
8899            if not trio_ped or len(trio_ped) != 3:
8900                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8901                log.error(msg_error)
8902                raise ValueError(msg_error)
8903
8904            # Log
8905            log.info(
8906                f"Calculation 'TRIO' - Samples: "
8907                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8908            )
8909
8910            # Field
8911            trio_infos = prefix + trio_tag
8912
8913            # Variants table
8914            table_variants = self.get_table_variants()
8915
8916            # Header
8917            vcf_reader = self.get_header()
8918
8919            # Create variant id
8920            variant_id_column = self.get_variant_id_column()
8921            added_columns = [variant_id_column]
8922
8923            # variant_id, FORMAT and samples
8924            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8925                self.get_header_sample_list()
8926            )
8927
8928            # Create dataframe
8929            dataframe_trio = self.get_query_to_df(
8930                f""" SELECT {samples_fields} FROM {table_variants} """
8931            )
8932
8933            # Create trio column
8934            dataframe_trio[trio_infos] = dataframe_trio.apply(
8935                lambda row: trio(row, samples=trio_samples), axis=1
8936            )
8937
8938            # Add trio to header
8939            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8940                trio_tag,
8941                ".",
8942                "String",
8943                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8944                "howard calculation",
8945                "0",
8946                self.code_type_map.get("String"),
8947            )
8948
8949            # Update
8950            sql_update = f"""
8951                UPDATE {table_variants}
8952                SET "INFO" = 
8953                    concat(
8954                        CASE
8955                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8956                            THEN ''
8957                            ELSE concat("INFO", ';')
8958                        END,
8959                        CASE
8960                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8961                             AND dataframe_trio."{trio_infos}" NOT NULL
8962                            THEN concat(
8963                                    '{trio_tag}=',
8964                                    dataframe_trio."{trio_infos}"
8965                                )
8966                            ELSE ''
8967                        END
8968                    )
8969                FROM dataframe_trio
8970                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8971            """
8972            self.conn.execute(sql_update)
8973
8974            # Remove added columns
8975            for added_column in added_columns:
8976                self.drop_column(column=added_column)
8977
8978            # Delete dataframe
8979            del dataframe_trio
8980            gc.collect()
8981
8982    def calculation_vaf_normalization(self) -> None:
8983        """
8984        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8985        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8986        :return: The function does not return anything.
8987        """
8988
8989        # if FORMAT and samples
8990        if (
8991            "FORMAT" in self.get_header_columns_as_list()
8992            and self.get_header_sample_list()
8993        ):
8994
8995            # vaf_normalization annotation field
8996            vaf_normalization_tag = "VAF"
8997
8998            # VCF infos tags
8999            vcf_infos_tags = {
9000                "VAF": "VAF Variant Frequency",
9001            }
9002
9003            # Prefix
9004            prefix = self.get_explode_infos_prefix()
9005
9006            # Variants table
9007            table_variants = self.get_table_variants()
9008
9009            # Header
9010            vcf_reader = self.get_header()
9011
9012            # Do not calculate if VAF already exists
9013            if "VAF" in vcf_reader.formats:
9014                log.debug("VAF already on genotypes")
9015                return
9016
9017            # Create variant id
9018            variant_id_column = self.get_variant_id_column()
9019            added_columns = [variant_id_column]
9020
9021            # variant_id, FORMAT and samples
9022            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9023                f""" "{sample}" """ for sample in self.get_header_sample_list()
9024            )
9025
9026            # Create dataframe
9027            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9028            log.debug(f"query={query}")
9029            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9030
9031            vaf_normalization_set = []
9032
9033            # for each sample vaf_normalization
9034            for sample in self.get_header_sample_list():
9035                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9036                    lambda row: vaf_normalization(row, sample=sample), axis=1
9037                )
9038                vaf_normalization_set.append(
9039                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9040                )
9041
9042            # Add VAF to FORMAT
9043            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9044                "FORMAT"
9045            ].apply(lambda x: str(x) + ":VAF")
9046            vaf_normalization_set.append(
9047                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9048            )
9049
9050            # Add vaf_normalization to header
9051            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9052                id=vaf_normalization_tag,
9053                num="1",
9054                type="Float",
9055                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9056                type_code=self.code_type_map.get("Float"),
9057            )
9058
9059            # Create fields to add in INFO
9060            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9061
9062            # Update
9063            sql_update = f"""
9064                UPDATE {table_variants}
9065                SET {sql_vaf_normalization_set}
9066                FROM dataframe_vaf_normalization
9067                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9068
9069            """
9070            self.conn.execute(sql_update)
9071
9072            # Remove added columns
9073            for added_column in added_columns:
9074                self.drop_column(column=added_column)
9075
9076            # Delete dataframe
9077            del dataframe_vaf_normalization
9078            gc.collect()
9079
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are produced ('<info>_stats_nb', '_list', '_min',
        '_max', '_mean', '_mediane', '_stdev'), each declared in the VCF header.
        The method is a no-op if the VCF has no 'FORMAT' column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotypes are required: FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per produced statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field name holding the stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to the variants table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe restricted to the genotype columns
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (row-wise computation)
            # assumes genotype_stats returns a dict-like of per-stat values — TODO confirm
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic to the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: none before the first fragment, ';' before the others.
                # NOTE(review): the ';' is baked into each fragment, so a NULL
                # first statistic followed by a non-NULL one could yield a
                # leading ';' — confirm whether stats can be NULL independently
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO: append every computed statistic
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_vaf_stats
            gc.collect()
9217
9218    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
9219        """
9220        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
9221        to it if transcripts are available.
9222
9223        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
9224        parameter that specifies the information field to be used in the transcripts JSON. It has a
9225        default value of "transcripts_json" if no value is provided when calling the method, defaults to
9226        transcripts_json
9227        :type info: str (optional)
9228        """
9229
9230        # Create transcripts table
9231        transcripts_table = self.create_transcript_view()
9232
9233        # Add info field
9234        if transcripts_table:
9235            self.transcript_view_to_variants(
9236                transcripts_table=transcripts_table, transcripts_info_field=info
9237            )
9238        else:
9239            log.info("No Transcripts to process. Check param.json file configuration")
9240
9241    ###############
9242    # Transcripts #
9243    ###############
9244
9245    def create_transcript_view_from_columns_map(
9246        self,
9247        transcripts_table: str = "transcripts",
9248        columns_maps: dict = {},
9249        added_columns: list = [],
9250        temporary_tables: list = None,
9251        annotation_fields: list = None,
9252    ) -> tuple[list, list, list]:
9253        """
9254        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9255        specified columns mapping for transcripts data.
9256
9257        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9258        the table where the transcripts data is stored or will be stored in the database. This table
9259        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9260        predictions, etc. It defaults to "transcripts, defaults to transcripts
9261        :type transcripts_table: str (optional)
9262        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9263        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9264        represents a mapping configuration for a specific set of columns. It typically includes details such
9265        as the main transcript column and additional information columns
9266        :type columns_maps: dict
9267        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9268        function is a list that stores the additional columns that will be added to the view being created
9269        based on the columns map provided. These columns are generated by exploding the transcript
9270        information columns along with the main transcript column
9271        :type added_columns: list
9272        :param temporary_tables: The `temporary_tables` parameter in the
9273        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9274        tables created during the process of creating a transcript view from a columns map. These temporary
9275        tables are used to store intermediate results or transformations before the final view is generated
9276        :type temporary_tables: list
9277        :param annotation_fields: The `annotation_fields` parameter in the
9278        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9279        for annotation in the query view creation process. These fields are extracted from the
9280        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9281        :type annotation_fields: list
9282        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9283        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9284        """
9285
9286        log.debug("Start transcrpts view creation from columns map...")
9287
9288        # "from_columns_map": [
9289        #     {
9290        #         "transcripts_column": "Ensembl_transcriptid",
9291        #         "transcripts_infos_columns": [
9292        #             "genename",
9293        #             "Ensembl_geneid",
9294        #             "LIST_S2_score",
9295        #             "LIST_S2_pred",
9296        #         ],
9297        #     },
9298        #     {
9299        #         "transcripts_column": "Ensembl_transcriptid",
9300        #         "transcripts_infos_columns": [
9301        #             "genename",
9302        #             "VARITY_R_score",
9303        #             "Aloft_pred",
9304        #         ],
9305        #     },
9306        # ],
9307
9308        # Init
9309        if temporary_tables is None:
9310            temporary_tables = []
9311        if annotation_fields is None:
9312            annotation_fields = []
9313
9314        # Variants table
9315        table_variants = self.get_table_variants()
9316
9317        for columns_map in columns_maps:
9318
9319            # Transcript column
9320            transcripts_column = columns_map.get("transcripts_column", None)
9321
9322            # Transcripts infos columns
9323            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9324
9325            if transcripts_column is not None:
9326
9327                # Explode
9328                added_columns += self.explode_infos(
9329                    fields=[transcripts_column] + transcripts_infos_columns
9330                )
9331
9332                # View clauses
9333                clause_select = []
9334                for field in [transcripts_column] + transcripts_infos_columns:
9335                    clause_select.append(
9336                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9337                    )
9338                    if field not in [transcripts_column]:
9339                        annotation_fields.append(field)
9340
9341                # Querey View
9342                query = f""" 
9343                    SELECT
9344                        "#CHROM", POS, REF, ALT,
9345                        "{transcripts_column}" AS 'transcript',
9346                        {", ".join(clause_select)}
9347                    FROM (
9348                        SELECT 
9349                            "#CHROM", POS, REF, ALT,
9350                            {", ".join(clause_select)}
9351                        FROM {table_variants}
9352                        )
9353                    WHERE "{transcripts_column}" IS NOT NULL
9354                """
9355
9356                # Create temporary table
9357                temporary_table = transcripts_table + "".join(
9358                    random.choices(string.ascii_uppercase + string.digits, k=10)
9359                )
9360
9361                # Temporary_tables
9362                temporary_tables.append(temporary_table)
9363                query_view = f"""
9364                    CREATE TEMPORARY TABLE {temporary_table}
9365                    AS ({query})
9366                """
9367                self.execute_query(query=query_view)
9368
9369        return added_columns, temporary_tables, annotation_fields
9370
9371    def create_transcript_view_from_column_format(
9372        self,
9373        transcripts_table: str = "transcripts",
9374        column_formats: dict = {},
9375        temporary_tables: list = None,
9376        annotation_fields: list = None,
9377    ) -> tuple[list, list, list]:
9378        """
9379        The `create_transcript_view_from_column_format` function generates a transcript view based on
9380        specified column formats, adds additional columns and annotation fields, and returns the list of
9381        temporary tables and annotation fields.
9382
9383        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9384        the table containing the transcripts data. This table will be used as the base table for creating
9385        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9386        different table name if needed, defaults to transcripts
9387        :type transcripts_table: str (optional)
9388        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9389        about the columns to be used for creating the transcript view. Each entry in the dictionary
9390        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9391        the provided code snippet:
9392        :type column_formats: dict
9393        :param temporary_tables: The `temporary_tables` parameter in the
9394        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9395        views created during the process of creating a transcript view from a column format. These temporary
9396        views are used to manipulate and extract data before generating the final transcript view. It
9397        :type temporary_tables: list
9398        :param annotation_fields: The `annotation_fields` parameter in the
9399        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9400        that are extracted from the temporary views created during the process. These annotation fields are
9401        obtained by querying the temporary views and extracting the column names excluding specific columns
9402        like `#CH
9403        :type annotation_fields: list
9404        :return: The `create_transcript_view_from_column_format` function returns two lists:
9405        `temporary_tables` and `annotation_fields`.
9406        """
9407
9408        log.debug("Start transcrpts view creation from column format...")
9409
9410        #  "from_column_format": [
9411        #     {
9412        #         "transcripts_column": "ANN",
9413        #         "transcripts_infos_column": "Feature_ID",
9414        #     }
9415        # ],
9416
9417        # Init
9418        if temporary_tables is None:
9419            temporary_tables = []
9420        if annotation_fields is None:
9421            annotation_fields = []
9422
9423        for column_format in column_formats:
9424
9425            # annotation field and transcript annotation field
9426            annotation_field = column_format.get("transcripts_column", "ANN")
9427            transcript_annotation = column_format.get(
9428                "transcripts_infos_column", "Feature_ID"
9429            )
9430
9431            # Temporary View name
9432            temporary_view_name = transcripts_table + "".join(
9433                random.choices(string.ascii_uppercase + string.digits, k=10)
9434            )
9435
9436            # Create temporary view name
9437            temporary_view_name = self.annotation_format_to_table(
9438                uniquify=True,
9439                annotation_field=annotation_field,
9440                view_name=temporary_view_name,
9441                annotation_id=transcript_annotation,
9442            )
9443
9444            # Annotation fields
9445            if temporary_view_name:
9446                query_annotation_fields = f"""
9447                    SELECT *
9448                    FROM (
9449                        DESCRIBE SELECT *
9450                        FROM {temporary_view_name}
9451                        )
9452                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9453                """
9454                df_annotation_fields = self.get_query_to_df(
9455                    query=query_annotation_fields
9456                )
9457
9458                # Add temporary view and annotation fields
9459                temporary_tables.append(temporary_view_name)
9460                annotation_fields += list(set(df_annotation_fields["column_name"]))
9461
9462        return temporary_tables, annotation_fields
9463
9464    def create_transcript_view(
9465        self,
9466        transcripts_table: str = None,
9467        transcripts_table_drop: bool = True,
9468        param: dict = {},
9469    ) -> str:
9470        """
9471        The `create_transcript_view` function generates a transcript view by processing data from a
9472        specified table based on provided parameters and structural information.
9473
9474        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9475        is used to specify the name of the table that will store the final transcript view data. If a table
9476        name is not provided, the function will create a new table to store the transcript view data, and by
9477        default,, defaults to transcripts
9478        :type transcripts_table: str (optional)
9479        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9480        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9481        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9482        the function will drop the existing transcripts table if it exists, defaults to True
9483        :type transcripts_table_drop: bool (optional)
9484        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9485        contains information needed to create a transcript view. It includes details such as the structure
9486        of the transcripts, columns mapping, column formats, and other necessary information for generating
9487        the view. This parameter allows for flexibility and customization
9488        :type param: dict
9489        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9490        created or modified during the execution of the function.
9491        """
9492
9493        log.debug("Start transcrpts view creation...")
9494
9495        # Default
9496        transcripts_table_default = "transcripts"
9497
9498        # Param
9499        if not param:
9500            param = self.get_param()
9501
9502        # Struct
9503        struct = param.get("transcripts", {}).get("struct", None)
9504
9505        if struct:
9506
9507            # Transcripts table
9508            if transcripts_table is None:
9509                transcripts_table = param.get("transcripts", {}).get(
9510                    "table", transcripts_table_default
9511                )
9512
9513            # added_columns
9514            added_columns = []
9515
9516            # Temporary tables
9517            temporary_tables = []
9518
9519            # Annotation fields
9520            annotation_fields = []
9521
9522            # from columns map
9523            columns_maps = struct.get("from_columns_map", [])
9524            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9525                self.create_transcript_view_from_columns_map(
9526                    transcripts_table=transcripts_table,
9527                    columns_maps=columns_maps,
9528                    added_columns=added_columns,
9529                    temporary_tables=temporary_tables,
9530                    annotation_fields=annotation_fields,
9531                )
9532            )
9533            added_columns += added_columns_tmp
9534            temporary_tables += temporary_tables_tmp
9535            annotation_fields += annotation_fields_tmp
9536
9537            # from column format
9538            column_formats = struct.get("from_column_format", [])
9539            temporary_tables_tmp, annotation_fields_tmp = (
9540                self.create_transcript_view_from_column_format(
9541                    transcripts_table=transcripts_table,
9542                    column_formats=column_formats,
9543                    temporary_tables=temporary_tables,
9544                    annotation_fields=annotation_fields,
9545                )
9546            )
9547            temporary_tables += temporary_tables_tmp
9548            annotation_fields += annotation_fields_tmp
9549
9550            # Merge temporary tables query
9551            query_merge = ""
9552            for temporary_table in temporary_tables:
9553
9554                # First temporary table
9555                if not query_merge:
9556                    query_merge = f"""
9557                        SELECT * FROM {temporary_table}
9558                    """
9559                # other temporary table (using UNION)
9560                else:
9561                    query_merge += f"""
9562                        UNION BY NAME SELECT * FROM {temporary_table}
9563                    """
9564
9565            # Merge on transcript
9566            query_merge_on_transcripts_annotation_fields = []
9567            # Aggregate all annotations fields
9568            for annotation_field in set(annotation_fields):
9569                query_merge_on_transcripts_annotation_fields.append(
9570                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9571                )
9572            # Query for transcripts view
9573            query_merge_on_transcripts = f"""
9574                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9575                FROM ({query_merge})
9576                GROUP BY "#CHROM", POS, REF, ALT, transcript
9577            """
9578
9579            # Drop transcript view is necessary
9580            if transcripts_table_drop:
9581                query_drop = f"""
9582                    DROP TABLE IF EXISTS {transcripts_table};
9583                """
9584                self.execute_query(query=query_drop)
9585
9586            # Merge and create transcript view
9587            query_create_view = f"""
9588                CREATE TABLE IF NOT EXISTS {transcripts_table}
9589                AS {query_merge_on_transcripts}
9590            """
9591            self.execute_query(query=query_create_view)
9592
9593            # Remove added columns
9594            for added_column in added_columns:
9595                self.drop_column(column=added_column)
9596
9597        else:
9598
9599            transcripts_table = None
9600
9601        return transcripts_table
9602
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Convert a formatted annotation INFO field (e.g. snpEff "ANN") into a
        structured temporary table, one row per annotation entry.

        The annotation sub-field names are taken from the field's description
        in the VCF header (the quoted, pipe-separated list). Each sub-field
        becomes a typed column; the `annotation_id` sub-field is also exposed
        as a 'transcript' column.

        :param uniquify: Whether to ensure unique values when exploding the
            annotation content, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field holding the formatted
            annotations, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Sub-field used as the transcript identifier
            column, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :return: The name of the created temporary table, or None if
            `annotation_field` is absent from the VCF header
        :raises ValueError: If the header description does not contain a
            quoted, pipe-separated sub-field list
        """

        # Name of the intermediate column holding the exploded JSON content
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier (keep alphanumerics only) so it
        # can be used safely as an SQL identifier
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by the literal
        # "INFO/" rather than being used as-is — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Prefixed column names for the annotation field and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added during processing, dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description: the quoted section, split on " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                # Map sanitized (alphanumeric-only) names to original names
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used to join back to variants)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants with the annotation column into a pandas DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON document, using the
            # header-derived sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry.
            # The queries below reference the local pandas DataFrame by its
            # variable name (dataframe_annotation_format) — resolved by
            # DuckDB's replacement scan of Python locals.
            # NOTE(review): the queries use the unprefixed column name
            # ({annotation_format}) while the DataFrame column was created as
            # {annotation_format_infos}; these only match when prefix is empty
            # — confirm against get_explode_infos_prefix() behavior
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed extraction clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key (alphanumerics only), safe as SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to sample their type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty sample values
                column_type = detect_column_type(df_json_type[key_clean])

                # Extraction clause: empty strings become NULL, cast to the
                # detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing annotation_id as 'transcript'
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing created
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
9755
9756    def transcript_view_to_variants(
9757        self,
9758        transcripts_table: str = None,
9759        transcripts_column_id: str = None,
9760        transcripts_info_json: str = None,
9761        transcripts_info_field: str = None,
9762        param: dict = {},
9763    ) -> bool:
9764        """
9765        The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
9766        a variants table with information from the transcripts in JSON format.
9767
9768        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table
9769        containing the transcripts data. If this parameter is not provided, the function will attempt to
9770        retrieve it from the `param` dictionary or use a default value of "transcripts"
9771        :type transcripts_table: str
9772        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in
9773        the `transcripts_table` that contains the unique identifier for each transcript. This identifier is
9774        used to match transcripts with variants in the database
9775        :type transcripts_column_id: str
9776        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of
9777        the column in the variants table where the transcripts information will be stored in JSON format
9778        :type transcripts_info_json: str
9779        :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field
9780        in the VCF header that will contain information about transcripts in JSON format. This field will be
9781        added to the VCF header as an INFO field with the specified name
9782        :type transcripts_info_field: str
9783        :param param: The `transcript_view_to_variants` method takes several parameters:
9784        :type param: dict
9785        :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the
9786        operation is successful and `False` if certain conditions are not met.
9787        """
9788
9789        log.debug("Start transcripts view to JSON...")
9790
9791        # Default
9792        transcripts_table_default = "transcripts"
9793        transcripts_column_id_default = "transcript"
9794        transcripts_info_json_default = None
9795        transcripts_info_field_default = None
9796
9797        # Param
9798        if not param:
9799            param = self.get_param()
9800
9801        # Transcripts table
9802        if transcripts_table is None:
9803            transcripts_table = param.get("transcripts", {}).get(
9804                "table", transcripts_table_default
9805            )
9806
9807        # Transcripts column ID
9808        if transcripts_column_id is None:
9809            transcripts_column_id = param.get("transcripts", {}).get(
9810                "column_id", transcripts_column_id_default
9811            )
9812
9813        # Transcripts info field
9814        if transcripts_info_json is None:
9815            transcripts_info_json = param.get("transcripts", {}).get(
9816                "transcripts_info_json", transcripts_info_json_default
9817            )
9818
9819        # Transcripts info field
9820        if transcripts_info_field is None:
9821            transcripts_info_field = param.get("transcripts", {}).get(
9822                "transcripts_info_field", transcripts_info_field_default
9823            )
9824
9825        # Variants table
9826        table_variants = self.get_table_variants()
9827
9828        # Check info columns param
9829        if transcripts_info_json is None and transcripts_info_field is None:
9830            return False
9831
9832        # Transcripts infos columns
9833        query_transcripts_infos_columns = f"""
9834            SELECT *
9835            FROM (
9836                DESCRIBE SELECT * FROM {transcripts_table}
9837                )
9838            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
9839        """
9840        transcripts_infos_columns = list(
9841            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
9842        )
9843
9844        # View results
9845        clause_select = []
9846        clause_to_json = []
9847        for field in transcripts_infos_columns:
9848            clause_select.append(
9849                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9850            )
9851            clause_to_json.append(f""" '{field}': "{field}" """)
9852
9853        # Update
9854        update_set = []
9855
9856        # VCF header
9857        vcf_reader = self.get_header()
9858
9859        # Transcripts to info column in JSON
9860        if transcripts_info_json is not None:
9861
9862            # Create column on variants table
9863            self.add_column(
9864                table_name=table_variants,
9865                column_name=transcripts_info_json,
9866                column_type="JSON",
9867                default_value=None,
9868                drop=False,
9869            )
9870
9871            # Add to update
9872            update_set.append(
9873                f""" {transcripts_info_json}=t.{transcripts_info_json} """
9874            )
9875
9876            # Add header
9877            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
9878                transcripts_info_json,
9879                ".",
9880                "String",
9881                "Transcripts in JSON format",
9882                "unknwon",
9883                "unknwon",
9884                self.code_type_map["String"],
9885            )
9886
9887        # Transcripts to info field in JSON
9888        if transcripts_info_field is not None:
9889
9890            # Add to update
9891            update_set.append(
9892                f""" 
9893                    INFO = concat(
9894                            CASE
9895                                WHEN INFO NOT IN ('', '.')
9896                                THEN INFO
9897                                ELSE ''
9898                            END,
9899                            CASE
9900                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
9901                                THEN concat(
9902                                    ';{transcripts_info_field}=',
9903                                    t.{transcripts_info_json}
9904                                )
9905                                ELSE ''
9906                            END
9907                            )
9908                """
9909            )
9910
9911            # Add header
9912            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
9913                transcripts_info_field,
9914                ".",
9915                "String",
9916                "Transcripts in JSON format",
9917                "unknwon",
9918                "unknwon",
9919                self.code_type_map["String"],
9920            )
9921
9922        # Update query
9923        query_update = f"""
9924            UPDATE {table_variants}
9925                SET {", ".join(update_set)}
9926            FROM
9927            (
9928                SELECT
9929                    "#CHROM", POS, REF, ALT,
9930                        concat(
9931                        '{{',
9932                        string_agg(
9933                            '"' || "{transcripts_column_id}" || '":' ||
9934                            to_json(json_output)
9935                        ),
9936                        '}}'
9937                        )::JSON AS {transcripts_info_json}
9938                FROM
9939                    (
9940                    SELECT
9941                        "#CHROM", POS, REF, ALT,
9942                        "{transcripts_column_id}",
9943                        to_json(
9944                            {{{",".join(clause_to_json)}}}
9945                        )::JSON AS json_output
9946                    FROM
9947                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
9948                    WHERE "{transcripts_column_id}" IS NOT NULL
9949                    )
9950                GROUP BY "#CHROM", POS, REF, ALT
9951            ) AS t
9952            WHERE {table_variants}."#CHROM" = t."#CHROM"
9953                AND {table_variants}."POS" = t."POS"
9954                AND {table_variants}."REF" = t."REF"
9955                AND {table_variants}."ALT" = t."ALT"
9956        """
9957
9958        self.execute_query(query=query_update)
9959
9960        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Load data
78        if load:
79            self.load_data()

The function __init__ initializes the variables and sets the input, output, config, param, connection and header.

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
 81    def set_input(self, input: str = None) -> None:
 82        """
 83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
 84        attributes in the class accordingly.
 85
 86        :param input: The `set_input` method in the provided code snippet is used to set attributes
 87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
 88        :type input: str
 89        """
 90
 91        if input and not isinstance(input, str):
 92            try:
 93                self.input = input.name
 94            except:
 95                log.error(f"Input file '{input} in bad format")
 96                raise ValueError(f"Input file '{input} in bad format")
 97        else:
 98            self.input = input
 99
100        # Input format
101        if input:
102            input_name, input_extension = os.path.splitext(self.input)
103            self.input_name = input_name
104            self.input_extension = input_extension
105            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object with a name attribute; the method extracts its name and extension and sets the input_name, input_extension and input_format attributes accordingly
def set_config(self, config: dict) -> None:
107    def set_config(self, config: dict) -> None:
108        """
109        The set_config function takes a config object and assigns it as the configuration object for the
110        class.
111
112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
113        contains configuration settings for the class. When you call the `set_config` function with a
114        dictionary object as the argument, it will set that dictionary as the configuration object for
115        the class
116        :type config: dict
117        """
118
119        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
121    def set_param(self, param: dict) -> None:
122        """
123        This function sets a parameter object for the class based on the input dictionary.
124
125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
126        as the `param` attribute of the class instance
127        :type param: dict
128        """
129
130        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
132    def init_variables(self) -> None:
133        """
134        This function initializes the variables that will be used in the rest of the class
135        """
136
137        self.prefix = "howard"
138        self.table_variants = "variants"
139        self.dataframe = None
140
141        self.comparison_map = {
142            "gt": ">",
143            "gte": ">=",
144            "lt": "<",
145            "lte": "<=",
146            "equals": "=",
147            "contains": "SIMILAR TO",
148        }
149
150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
151
152        self.code_type_map_to_sql = {
153            "Integer": "INTEGER",
154            "String": "VARCHAR",
155            "Float": "FLOAT",
156            "Flag": "VARCHAR",
157        }
158
159        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
161    def get_indexing(self) -> bool:
162        """
163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
164        returns False.
165        :return: The value of the indexing parameter.
166        """
167
168        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
170    def get_connexion_config(self) -> dict:
171        """
172        The function `get_connexion_config` returns a dictionary containing the configuration for a
173        connection, including the number of threads and memory limit.
174        :return: a dictionary containing the configuration for the Connexion library.
175        """
176
177        # config
178        config = self.get_config()
179
180        # Connexion config
181        connexion_config = {}
182        threads = self.get_threads()
183
184        # Threads
185        if threads:
186            connexion_config["threads"] = threads
187
188        # Memory
189        # if config.get("memory", None):
190        #     connexion_config["memory_limit"] = config.get("memory")
191        if self.get_memory():
192            connexion_config["memory_limit"] = self.get_memory()
193
194        # Temporary directory
195        if config.get("tmp", None):
196            connexion_config["temp_directory"] = config.get("tmp")
197
198        # Access
199        if config.get("access", None):
200            access = config.get("access")
201            if access in ["RO"]:
202                access = "READ_ONLY"
203            elif access in ["RW"]:
204                access = "READ_WRITE"
205            connexion_db = self.get_connexion_db()
206            if connexion_db in ":memory:":
207                access = "READ_WRITE"
208            connexion_config["access_mode"] = access
209
210        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration options for the database connection (threads, memory limit, temporary directory and access mode).

def get_duckdb_settings(self) -> dict:
212    def get_duckdb_settings(self) -> dict:
213        """
214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
215        string.
216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
217        """
218
219        # config
220        config = self.get_config()
221
222        # duckdb settings
223        duckdb_settings_dict = {}
224        if config.get("duckdb_settings", None):
225            duckdb_settings = config.get("duckdb_settings")
226            duckdb_settings = full_path(duckdb_settings)
227            # duckdb setting is a file
228            if os.path.exists(duckdb_settings):
229                with open(duckdb_settings) as json_file:
230                    duckdb_settings_dict = yaml.safe_load(json_file)
231            # duckdb settings is a string
232            else:
233                duckdb_settings_dict = json.loads(duckdb_settings)
234
235        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
237    def set_connexion_db(self) -> str:
238        """
239        The function `set_connexion_db` returns the appropriate database connection string based on the
240        input format and connection type.
241        :return: the value of the variable `connexion_db`.
242        """
243
244        # Default connexion db
245        default_connexion_db = ":memory:"
246
247        # Find connexion db
248        if self.get_input_format() in ["db", "duckdb"]:
249            connexion_db = self.get_input()
250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
251            connexion_db = default_connexion_db
252        elif self.get_connexion_type() in ["tmpfile"]:
253            tmp_name = tempfile.mkdtemp(
254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
255            )
256            connexion_db = f"{tmp_name}/tmp.db"
257        elif self.get_connexion_type() != "":
258            connexion_db = self.get_connexion_type()
259        else:
260            connexion_db = default_connexion_db
261
262        # Set connexion db
263        self.connexion_db = connexion_db
264
265        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
267    def set_connexion(self, conn) -> None:
268        """
269        The function `set_connexion` creates a connection to a database, with options for different
270        database formats and settings.
271
272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
273        database. If a connection is not provided, a new connection to an in-memory database is created.
274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
275        sqlite
276        """
277
278        # Connexion db
279        connexion_db = self.set_connexion_db()
280
281        # Connexion config
282        connexion_config = self.get_connexion_config()
283
284        # Connexion format
285        connexion_format = self.get_config().get("connexion_format", "duckdb")
286        # Set connexion format
287        self.connexion_format = connexion_format
288
289        # Connexion
290        if not conn:
291            if connexion_format in ["duckdb"]:
292                conn = duckdb.connect(connexion_db, config=connexion_config)
293                # duckDB settings
294                duckdb_settings = self.get_duckdb_settings()
295                if duckdb_settings:
296                    for setting in duckdb_settings:
297                        setting_value = duckdb_settings.get(setting)
298                        if isinstance(setting_value, str):
299                            setting_value = f"'{setting_value}'"
300                        conn.execute(f"PRAGMA {setting}={setting_value};")
301            elif connexion_format in ["sqlite"]:
302                conn = sqlite3.connect(connexion_db)
303
304        # Set connexion
305        self.conn = conn
306
307        # Log
308        log.debug(f"connexion_format: {connexion_format}")
309        log.debug(f"connexion_db: {connexion_db}")
310        log.debug(f"connexion config: {connexion_config}")
311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then sets up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
313    def set_output(self, output: str = None) -> None:
314        """
315        The `set_output` function in Python sets the output file based on the input or a specified key
316        in the config file, extracting the output name, extension, and format.
317
318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
319        the output file. If the config file has an 'output' key, the method sets the output to the value
320        of that key. If no output is provided, it sets the output to `None`
321        :type output: str
322        """
323
324        if output and not isinstance(output, str):
325            self.output = output.name
326        else:
327            self.output = output
328
329        # Output format
330        if self.output:
331            output_name, output_extension = os.path.splitext(self.output)
332            self.output_name = output_name
333            self.output_extension = output_extension
334            self.output_format = self.output_extension.replace(".", "")
335        else:
336            self.output_name = None
337            self.output_extension = None
338            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
340    def set_header(self) -> None:
341        """
342        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
343        """
344
345        input_file = self.get_input()
346        default_header_list = [
347            "##fileformat=VCFv4.2",
348            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
349        ]
350
351        # Full path
352        input_file = full_path(input_file)
353
354        if input_file:
355
356            input_format = self.get_input_format()
357            input_compressed = self.get_input_compressed()
358            config = self.get_config()
359            header_list = default_header_list
360            if input_format in [
361                "vcf",
362                "hdr",
363                "tsv",
364                "csv",
365                "psv",
366                "parquet",
367                "db",
368                "duckdb",
369            ]:
370                # header provided in param
371                if config.get("header_file", None):
372                    with open(config.get("header_file"), "rt") as f:
373                        header_list = self.read_vcf_header(f)
374                # within a vcf file format (header within input file itsself)
375                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
376                    # within a compressed vcf file format (.vcf.gz)
377                    if input_compressed:
378                        with bgzf.open(input_file, "rt") as f:
379                            header_list = self.read_vcf_header(f)
380                    # within an uncompressed vcf file format (.vcf)
381                    else:
382                        with open(input_file, "rt") as f:
383                            header_list = self.read_vcf_header(f)
384                # header provided in default external file .hdr
385                elif os.path.exists((input_file + ".hdr")):
386                    with open(input_file + ".hdr", "rt") as f:
387                        header_list = self.read_vcf_header(f)
388                else:
389                    try:  # Try to get header info fields and file columns
390
391                        with tempfile.TemporaryDirectory() as tmpdir:
392
393                            # Create database
394                            db_for_header = Database(database=input_file)
395
396                            # Get header columns for infos fields
397                            db_header_from_columns = (
398                                db_for_header.get_header_from_columns()
399                            )
400
401                            # Get real columns in the file
402                            db_header_columns = db_for_header.get_columns()
403
404                            # Write header file
405                            header_file_tmp = os.path.join(tmpdir, "header")
406                            f = open(header_file_tmp, "w")
407                            vcf.Writer(f, db_header_from_columns)
408                            f.close()
409
410                            # Replace #CHROM line with rel columns
411                            header_list = db_for_header.read_header_file(
412                                header_file=header_file_tmp
413                            )
414                            header_list[-1] = "\t".join(db_header_columns)
415
416                    except:
417
418                        log.warning(
419                            f"No header for file {input_file}. Set as default VCF header"
420                        )
421                        header_list = default_header_list
422
423            else:  # try for unknown format ?
424
425                log.error(f"Input file format '{input_format}' not available")
426                raise ValueError(f"Input file format '{input_format}' not available")
427
428            if not header_list:
429                header_list = default_header_list
430
431            # header as list
432            self.header_list = header_list
433
434            # header as VCF object
435            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
436
437        else:
438
439            self.header_list = None
440            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
443        """
444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
445        DataFrame based on the connection format.
446
447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
448        represents the SQL query you want to execute. This query will be used to fetch data from a
449        database and convert it into a pandas DataFrame
450        :type query: str
451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
453        function will only fetch up to that number of rows from the database query result. If no limit
454        is specified,
455        :type limit: int
456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
457        """
458
459        # Connexion format
460        connexion_format = self.get_connexion_format()
461
462        # Limit in query
463        if limit:
464            pd.set_option("display.max_rows", limit)
465            if connexion_format in ["duckdb"]:
466                df = (
467                    self.conn.execute(query)
468                    .fetch_record_batch(limit)
469                    .read_next_batch()
470                    .to_pandas()
471                )
472            elif connexion_format in ["sqlite"]:
473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
474
475        # Full query
476        else:
477            if connexion_format in ["duckdb"]:
478                df = self.conn.execute(query).df()
479            elif connexion_format in ["sqlite"]:
480                df = pd.read_sql_query(query, self.conn)
481
482        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: the maximum number of rows to fetch into the resulting dataframe. If a limit is provided, the function only fetches up to that number of rows from the database query result. If no limit is specified, the full query result is fetched.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
484    def get_overview(self) -> None:
485        """
486        The function prints the input, output, config, and dataframe of the current object
487        """
488        table_variants_from = self.get_table_variants(clause="from")
489        sql_columns = self.get_header_columns_as_sql()
490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
491        df = self.get_query_to_df(sql_query_export)
492        log.info(
493            "Input:  "
494            + str(self.get_input())
495            + " ["
496            + str(str(self.get_input_format()))
497            + "]"
498        )
499        log.info(
500            "Output: "
501            + str(self.get_output())
502            + " ["
503            + str(str(self.get_output_format()))
504            + "]"
505        )
506        log.info("Config: ")
507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
508            "\n"
509        ):
510            log.info("\t" + str(d))
511        log.info("Param: ")
512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
513            "\n"
514        ):
515            log.info("\t" + str(d))
516        log.info("Sample list: " + str(self.get_header_sample_list()))
517        log.info("Dataframe: ")
518        for d in str(df).split("\n"):
519            log.info("\t" + str(d))
520
521        # garbage collector
522        del df
523        gc.collect()
524
525        return None

The function logs the input, output, config, param, sample list and dataframe of the current object

def get_stats(self) -> dict:
527    def get_stats(self) -> dict:
528        """
529        The `get_stats` function calculates and returns various statistics of the current object,
530        including information about the input file, variants, samples, header fields, quality, and
531        SNVs/InDels.
532        :return: a dictionary containing various statistics of the current object. The dictionary has
533        the following structure:
534        """
535
536        # Log
537        log.info(f"Stats Calculation...")
538
539        # table varaints
540        table_variants_from = self.get_table_variants()
541
542        # stats dict
543        stats = {"Infos": {}}
544
545        ### File
546        input_file = self.get_input()
547        stats["Infos"]["Input file"] = input_file
548
549        # Header
550        header_infos = self.get_header().infos
551        header_formats = self.get_header().formats
552        header_infos_list = list(header_infos)
553        header_formats_list = list(header_formats)
554
555        ### Variants
556
557        stats["Variants"] = {}
558
559        # Variants by chr
560        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
561        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
562        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
563            by=["CHROM"], kind="quicksort"
564        )
565
566        # Total number of variants
567        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
568
569        # Calculate percentage
570        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
571            lambda x: (x / nb_of_variants)
572        )
573
574        stats["Variants"]["Number of variants by chromosome"] = (
575            nb_of_variants_by_chrom.to_dict(orient="index")
576        )
577
578        stats["Infos"]["Number of variants"] = int(nb_of_variants)
579
580        ### Samples
581
582        # Init
583        samples = {}
584        nb_of_samples = 0
585
586        # Check Samples
587        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
588            log.debug(f"Check samples...")
589            for sample in self.get_header_sample_list():
590                sql_query_samples = f"""
591                    SELECT  '{sample}' as sample,
592                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
593                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
594                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
595                    FROM {table_variants_from}
596                    WHERE (
597                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
598                        AND
599                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
600                      )
601                    GROUP BY genotype
602                    """
603                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
604                sample_genotype_count = sql_query_genotype_df["count"].sum()
605                if len(sql_query_genotype_df):
606                    nb_of_samples += 1
607                    samples[f"{sample} - {sample_genotype_count} variants"] = (
608                        sql_query_genotype_df.to_dict(orient="index")
609                    )
610
611            stats["Samples"] = samples
612            stats["Infos"]["Number of samples"] = nb_of_samples
613
614        # #
615        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
616        #     stats["Infos"]["Number of samples"] = nb_of_samples
617        # elif nb_of_samples:
618        #     stats["Infos"]["Number of samples"] = "not a VCF format"
619
620        ### INFO and FORMAT fields
621        header_types_df = {}
622        header_types_list = {
623            "List of INFO fields": header_infos,
624            "List of FORMAT fields": header_formats,
625        }
626        i = 0
627        for header_type in header_types_list:
628
629            header_type_infos = header_types_list.get(header_type)
630            header_infos_dict = {}
631
632            for info in header_type_infos:
633
634                i += 1
635                header_infos_dict[i] = {}
636
637                # ID
638                header_infos_dict[i]["id"] = info
639
640                # num
641                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
642                if header_type_infos[info].num in genotype_map.keys():
643                    header_infos_dict[i]["Number"] = genotype_map.get(
644                        header_type_infos[info].num
645                    )
646                else:
647                    header_infos_dict[i]["Number"] = header_type_infos[info].num
648
649                # type
650                if header_type_infos[info].type:
651                    header_infos_dict[i]["Type"] = header_type_infos[info].type
652                else:
653                    header_infos_dict[i]["Type"] = "."
654
655                # desc
656                if header_type_infos[info].desc != None:
657                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
658                else:
659                    header_infos_dict[i]["Description"] = ""
660
661            if len(header_infos_dict):
662                header_types_df[header_type] = pd.DataFrame.from_dict(
663                    header_infos_dict, orient="index"
664                ).to_dict(orient="index")
665
666        # Stats
667        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
668        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
669        stats["Header"] = header_types_df
670
671        ### QUAL
672        if "QUAL" in self.get_header_columns():
673            sql_query_qual = f"""
674                    SELECT
675                        avg(CAST(QUAL AS INTEGER)) AS Average,
676                        min(CAST(QUAL AS INTEGER)) AS Minimum,
677                        max(CAST(QUAL AS INTEGER)) AS Maximum,
678                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
679                        median(CAST(QUAL AS INTEGER)) AS Median,
680                        variance(CAST(QUAL AS INTEGER)) AS Variance
681                    FROM {table_variants_from}
682                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
683                    """
684
685            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
686            stats["Quality"] = {"Stats": qual}
687
688        ### SNV and InDel
689
690        sql_query_snv = f"""
691            
692            SELECT Type, count FROM (
693
694                    SELECT
695                        'Total' AS Type,
696                        count(*) AS count
697                    FROM {table_variants_from}
698
699                    UNION
700
701                    SELECT
702                        'MNV' AS Type,
703                        count(*) AS count
704                    FROM {table_variants_from}
705                    WHERE len(REF) > 1 AND len(ALT) > 1
706                    AND len(REF) = len(ALT)
707
708                    UNION
709
710                    SELECT
711                        'InDel' AS Type,
712                        count(*) AS count
713                    FROM {table_variants_from}
714                    WHERE len(REF) > 1 OR len(ALT) > 1
715                    AND len(REF) != len(ALT)
716                    
717                    UNION
718
719                    SELECT
720                        'SNV' AS Type,
721                        count(*) AS count
722                    FROM {table_variants_from}
723                    WHERE len(REF) = 1 AND len(ALT) = 1
724
725                )
726
727            ORDER BY count DESC
728
729                """
730        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
731
732        sql_query_snv_substitution = f"""
733                SELECT
734                    concat(REF, '>', ALT) AS 'Substitution',
735                    count(*) AS count
736                FROM {table_variants_from}
737                WHERE len(REF) = 1 AND len(ALT) = 1
738                GROUP BY REF, ALT
739                ORDER BY count(*) DESC
740                """
741        snv_substitution = (
742            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
743        )
744        stats["Variants"]["Counts"] = snv_indel
745        stats["Variants"]["Substitutions"] = snv_substitution
746
747        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object, organized into sections such as 'Infos', 'Variants', 'Samples', 'Header' and 'Quality'.

def stats_to_file(self, file: str = None) -> str:
749    def stats_to_file(self, file: str = None) -> str:
750        """
751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
752        into a JSON object, and writes the JSON object to the specified file.
753
754        :param file: The `file` parameter is a string that represents the file path where the JSON data
755        will be written
756        :type file: str
757        :return: the name of the file that was written to.
758        """
759
760        # Get stats
761        stats = self.get_stats()
762
763        # Serializing json
764        json_object = json.dumps(stats, indent=4)
765
766        # Writing to sample.json
767        with open(file, "w") as outfile:
768            outfile.write(json_object)
769
770        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
773        """
774        The `print_stats` function generates a markdown file and prints the statistics contained in a
775        JSON file in a formatted manner.
776
777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
779        provided, a temporary directory will be created and the stats will be saved in a file named
780        "stats.md" within that
781        :type output_file: str
782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
783        file where the statistics will be saved. If no value is provided, a temporary directory will be
784        created and a default file name "stats.json" will be used
785        :type json_file: str
786        :return: The function `print_stats` does not return any value. It has a return type annotation
787        of `None`.
788        """
789
790        # Full path
791        output_file = full_path(output_file)
792        json_file = full_path(json_file)
793
794        with tempfile.TemporaryDirectory() as tmpdir:
795
796            # Files
797            if not output_file:
798                output_file = os.path.join(tmpdir, "stats.md")
799            if not json_file:
800                json_file = os.path.join(tmpdir, "stats.json")
801
802            # Create folders
803            if not os.path.exists(os.path.dirname(output_file)):
804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
805            if not os.path.exists(os.path.dirname(json_file)):
806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
807
808            # Create stats JSON file
809            stats_file = self.stats_to_file(file=json_file)
810
811            # Print stats file
812            with open(stats_file) as f:
813                stats = yaml.safe_load(f)
814
815            # Output
816            output_title = []
817            output_index = []
818            output = []
819
820            # Title
821            output_title.append("# HOWARD Stats")
822
823            # Index
824            output_index.append("## Index")
825
826            # Process sections
827            for section in stats:
828                infos = stats.get(section)
829                section_link = "#" + section.lower().replace(" ", "-")
830                output.append(f"## {section}")
831                output_index.append(f"- [{section}]({section_link})")
832
833                if len(infos):
834                    for info in infos:
835                        try:
836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
837                            is_df = True
838                        except:
839                            try:
840                                df = pd.DataFrame.from_dict(
841                                    json.loads((infos.get(info))), orient="index"
842                                )
843                                is_df = True
844                            except:
845                                is_df = False
846                        if is_df:
847                            output.append(f"### {info}")
848                            info_link = "#" + info.lower().replace(" ", "-")
849                            output_index.append(f"   - [{info}]({info_link})")
850                            output.append(f"{df.to_markdown(index=False)}")
851                        else:
852                            output.append(f"- {info}: {infos.get(info)}")
853                else:
854                    output.append(f"NA")
855
856            # Write stats in markdown file
857            with open(output_file, "w") as fp:
858                for item in output_title:
859                    fp.write("%s\n" % item)
860                for item in output_index:
861                    fp.write("%s\n" % item)
862                for item in output:
863                    fp.write("%s\n" % item)
864
865            # Output stats in markdown
866            print("")
867            print("\n\n".join(output_title))
868            print("")
869            print("\n\n".join(output))
870            print("")
871
872        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
874    def get_input(self) -> str:
875        """
876        It returns the value of the input variable.
877        :return: The input is being returned.
878        """
879        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
881    def get_input_format(self, input_file: str = None) -> str:
882        """
883        This function returns the format of the input variable, either from the provided input file or
884        by prompting for input.
885
886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
887        represents the file path of the input file. If no `input_file` is provided when calling the
888        method, it will default to `None`
889        :type input_file: str
890        :return: The format of the input variable is being returned.
891        """
892
893        if not input_file:
894            input_file = self.get_input()
895        input_format = get_file_format(input_file)
896        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
898    def get_input_compressed(self, input_file: str = None) -> str:
899        """
900        The function `get_input_compressed` returns the format of the input variable after compressing
901        it.
902
903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
904        that represents the file path of the input file. If no `input_file` is provided when calling the
905        method, it will default to `None` and the method will then call `self.get_input()` to
906        :type input_file: str
907        :return: The function `get_input_compressed` returns the compressed format of the input
908        variable.
909        """
910
911        if not input_file:
912            input_file = self.get_input()
913        input_compressed = get_file_compressed(input_file)
914        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
916    def get_output(self) -> str:
917        """
918        It returns the output of the neuron.
919        :return: The output of the neural network.
920        """
921
922        return self.output

It returns the output file path.

Returns

The output file path.

def get_output_format(self, output_file: str = None) -> str:
924    def get_output_format(self, output_file: str = None) -> str:
925        """
926        The function `get_output_format` returns the format of the input variable or the output file if
927        provided.
928
929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
930        that represents the file path of the output file. If no `output_file` is provided when calling
931        the method, it will default to the output obtained from the `get_output` method of the class
932        instance. The
933        :type output_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not output_file:
938            output_file = self.get_output()
939        output_format = get_file_format(output_file)
940
941        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
943    def get_config(self) -> dict:
944        """
945        It returns the config
946        :return: The config variable is being returned.
947        """
948        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
950    def get_param(self) -> dict:
951        """
952        It returns the param
953        :return: The param variable is being returned.
954        """
955        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
957    def get_connexion_db(self) -> str:
958        """
959        It returns the connexion_db attribute of the object
960        :return: The connexion_db is being returned.
961        """
962        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
964    def get_prefix(self) -> str:
965        """
966        It returns the prefix of the object.
967        :return: The prefix is being returned.
968        """
969        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
971    def get_table_variants(self, clause: str = "select") -> str:
972        """
973        This function returns the table_variants attribute of the object
974
975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
976        defaults to select (optional)
977        :return: The table_variants attribute of the object.
978        """
979
980        # Access
981        access = self.get_config().get("access", None)
982
983        # Clauses "select", "where", "update"
984        if clause in ["select", "where", "update"]:
985            table_variants = self.table_variants
986        # Clause "from"
987        elif clause in ["from"]:
988            # For Read Only
989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
990                input_file = self.get_input()
991                table_variants = f"'{input_file}' as variants"
992            # For Read Write
993            else:
994                table_variants = f"{self.table_variants} as variants"
995        else:
996            table_variants = self.table_variants
997        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory".

Returns

The connexion type is being returned.

def get_connexion(self):
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""

This function returns the #CHROM columns line of the VCF header

Returns

The #CHROM columns line of the header, or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []

This function returns the header column names of a VCF as a list

Returns

The list of header column names, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)

This function returns the header column names as double-quoted, comma-separated SQL identifiers

Returns

The header column names formatted for use in SQL.

def get_header_sample_list(self) -> list:
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples

This function returns the sample names declared in the VCF header

Returns

The list of sample names.

def get_verbose(self) -> bool:
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connexion (DuckDB or
        SQLite).

        :param file: path or file-like object of the file to load
        :param columns: comma-separated list of (quoted) column names the
        data will be inserted into
        :type columns: str
        :param header_len: number of header lines to skip at the
        beginning of the file before reading data, defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk (may be
        overridden by the "load.chunk" config entry), defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" config entry overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize resolves to a falsy value, nothing is
        # loaded at all — confirm this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "FROM chunk" relies on DuckDB's replacement scan, which
                    # resolves the local DataFrame variable named "chunk";
                    # the variable name is therefore load-bearing.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: append the chunk via pandas' to_sql
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1189    def load_data(
1190        self,
1191        input_file: str = None,
1192        drop_variants_table: bool = False,
1193        sample_size: int = 20480,
1194    ) -> None:
1195        """
1196        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1197        table before loading the data and specify a sample size.
1198
1199        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1200        table
1201        :type input_file: str
1202        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1203        determines whether the variants table should be dropped before loading the data. If set to
1204        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1205        not be dropped, defaults to False
1206        :type drop_variants_table: bool (optional)
1207        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1208        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1209        20480
1210        :type sample_size: int (optional)
1211        """
1212
1213        log.info("Loading...")
1214
1215        # change input file
1216        if input_file:
1217            self.set_input(input_file)
1218            self.set_header()
1219
1220        # drop variants table
1221        if drop_variants_table:
1222            self.drop_variants_table()
1223
1224        # get table variants
1225        table_variants = self.get_table_variants()
1226
1227        # Access
1228        access = self.get_config().get("access", None)
1229        log.debug(f"access: {access}")
1230
1231        # Input format and compress
1232        input_format = self.get_input_format()
1233        input_compressed = self.get_input_compressed()
1234        log.debug(f"input_format: {input_format}")
1235        log.debug(f"input_compressed: {input_compressed}")
1236
1237        # input_compressed_format
1238        if input_compressed:
1239            input_compressed_format = "gzip"
1240        else:
1241            input_compressed_format = "none"
1242        log.debug(f"input_compressed_format: {input_compressed_format}")
1243
1244        # Connexion format
1245        connexion_format = self.get_connexion_format()
1246
1247        # Sample size
1248        if not sample_size:
1249            sample_size = -1
1250        log.debug(f"sample_size: {sample_size}")
1251
1252        # Load data
1253        log.debug(f"Load Data from {input_format}")
1254
1255        # DuckDB connexion
1256        if connexion_format in ["duckdb"]:
1257
1258            # Database already exists
1259            if self.input_format in ["db", "duckdb"]:
1260
1261                if connexion_format in ["duckdb"]:
1262                    log.debug(f"Input file format '{self.input_format}' duckDB")
1263                else:
1264                    log.error(
1265                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1266                    )
1267                    raise ValueError(
1268                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1269                    )
1270
1271            # Load from existing database format
1272            else:
1273
1274                try:
1275                    # Create Table or View
1276                    database = Database(database=self.input)
1277                    sql_from = database.get_sql_from(sample_size=sample_size)
1278
1279                    if access in ["RO"]:
1280                        sql_load = (
1281                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1282                        )
1283                    else:
1284                        sql_load = (
1285                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1286                        )
1287                    self.conn.execute(sql_load)
1288
1289                except:
1290                    # Format not available
1291                    log.error(f"Input file format '{self.input_format}' not available")
1292                    raise ValueError(
1293                        f"Input file format '{self.input_format}' not available"
1294                    )
1295
1296        # SQLite connexion
1297        elif connexion_format in ["sqlite"] and input_format in [
1298            "vcf",
1299            "tsv",
1300            "csv",
1301            "psv",
1302        ]:
1303
1304            # Main structure
1305            structure = {
1306                "#CHROM": "VARCHAR",
1307                "POS": "INTEGER",
1308                "ID": "VARCHAR",
1309                "REF": "VARCHAR",
1310                "ALT": "VARCHAR",
1311                "QUAL": "VARCHAR",
1312                "FILTER": "VARCHAR",
1313                "INFO": "VARCHAR",
1314            }
1315
1316            # Strcuture with samples
1317            structure_complete = structure
1318            if self.get_header_sample_list():
1319                structure["FORMAT"] = "VARCHAR"
1320                for sample in self.get_header_sample_list():
1321                    structure_complete[sample] = "VARCHAR"
1322
1323            # Columns list for create and insert
1324            sql_create_table_columns = []
1325            sql_create_table_columns_list = []
1326            for column in structure_complete:
1327                column_type = structure_complete[column]
1328                sql_create_table_columns.append(
1329                    f'"{column}" {column_type} default NULL'
1330                )
1331                sql_create_table_columns_list.append(f'"{column}"')
1332
1333            # Create database
1334            log.debug(f"Create Table {table_variants}")
1335            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1336            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1337            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1338            self.conn.execute(sql_create_table)
1339
1340            # chunksize define length of file chunk load file
1341            chunksize = 100000
1342
1343            # delimiter
1344            delimiter = file_format_delimiters.get(input_format, "\t")
1345
1346            # Load the input file
1347            with open(self.input, "rt") as input_file:
1348
1349                # Use the appropriate file handler based on the input format
1350                if input_compressed:
1351                    input_file = bgzf.open(self.input, "rt")
1352                if input_format in ["vcf"]:
1353                    header_len = self.get_header_length()
1354                else:
1355                    header_len = 0
1356
1357                # Insert the file contents into a table
1358                self.insert_file_to_table(
1359                    input_file,
1360                    columns=sql_create_table_columns_list_sql,
1361                    header_len=header_len,
1362                    sep=delimiter,
1363                    chunksize=chunksize,
1364                )
1365
1366        else:
1367            log.error(
1368                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1369            )
1370            raise ValueError(
1371                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1372            )
1373
1374        # Explode INFOS fields into table fields
1375        if self.get_explode_infos():
1376            self.explode_infos(
1377                prefix=self.get_explode_infos_prefix(),
1378                fields=self.get_explode_infos_fields(),
1379                force=True,
1380            )
1381
1382        # Create index after insertion
1383        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1395    def get_explode_infos_fields(
1396        self,
1397        explode_infos_fields: str = None,
1398        remove_fields_not_in_header: bool = False,
1399    ) -> list:
1400        """
1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1402        the input parameter `explode_infos_fields`.
1403
1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1406        comma-separated list of field names to explode
1407        :type explode_infos_fields: str
1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1409        flag that determines whether to remove fields that are not present in the header. If it is set
1410        to `True`, any field that is not in the header will be excluded from the list of exploded
1411        information fields. If it is set to `, defaults to False
1412        :type remove_fields_not_in_header: bool (optional)
1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
1417        splitting the string by commas.
1418        """
1419
1420        # If no fields, get it in param
1421        if not explode_infos_fields:
1422            explode_infos_fields = (
1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1424            )
1425
1426        # If no fields, defined as all fields in header using keyword
1427        if not explode_infos_fields:
1428            explode_infos_fields = "*"
1429
1430        # If fields list not empty
1431        if explode_infos_fields:
1432
1433            # Input fields list
1434            if isinstance(explode_infos_fields, str):
1435                fields_input = explode_infos_fields.split(",")
1436            elif isinstance(explode_infos_fields, list):
1437                fields_input = explode_infos_fields
1438            else:
1439                fields_input = []
1440
1441            # Fields list without * keyword
1442            fields_without_all = fields_input.copy()
1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
1444                fields_without_all.remove("*")
1445
1446            # Fields in header
1447            fields_in_header = sorted(list(set(self.get_header().infos)))
1448
1449            # Construct list of fields
1450            fields_output = []
1451            for field in fields_input:
1452
1453                # Strip field
1454                field = field.strip()
1455
1456                # format keyword * in regex
1457                if field.upper() in ["*"]:
1458                    field = ".*"
1459
1460                # Find all fields with pattern
1461                r = re.compile(field)
1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
1463
1464                # Remove fields input from search
1465                if fields_search != [field]:
1466                    fields_search = sorted(
1467                        list(set(fields_search).difference(fields_input))
1468                    )
1469
1470                # If field is not in header (avoid not well formatted header)
1471                if not fields_search and not remove_fields_not_in_header:
1472                    fields_search = [field]
1473
1474                # Add found fields
1475                for new_field in fields_search:
1476                    # Add field, if not already exists, and if it is in header (if asked)
1477                    if (
1478                        new_field not in fields_output
1479                        and (
1480                            not remove_fields_not_in_header
1481                            or new_field in fields_in_header
1482                        )
1483                        and new_field not in [".*"]
1484                    ):
1485                        fields_output.append(new_field)
1486
1487            return fields_output
1488
1489        else:
1490
1491            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, either as a comma-separated string or a list of field names. Each entry is treated as a pattern matched against the header INFO fields, and "*" expands to all header fields
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields; if it is set to False (the default), such fields are kept as-is. Defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the value from the parameters is used, falling back to "*" (all header fields). Each entry is stripped of spaces, treated as a pattern, and matched against the header INFO fields; the resulting field names are returned deduplicated.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1493    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1494        """
1495        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1496        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1497        not provided.
1498
1499        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1500        prefix to be used for exploding or expanding information
1501        :type explode_infos_prefix: str
1502        :return: the value of the variable `explode_infos_prefix`.
1503        """
1504
1505        if not explode_infos_prefix:
1506            explode_infos_prefix = (
1507                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1508            )
1509
1510        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode", {}).get("explode_infos_prefix", "") if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column. If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop and
        re-create the column if it already exists in the table. If `drop` is set to `False` (default)
        and the column exists, nothing is done, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column ("table_name", "column_name", "column_type",
        "default_value") if the column was newly added; None if the column already existed — whether
        it was left untouched, or dropped and re-created (a re-created column is deliberately not
        reported as added).
        """

        # Track whether the column was newly added vs dropped-and-recreated
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the schema, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-and-recreated column is NOT counted as "added"
        # (callers such as explode_infos rely on "added_column or force")
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the existing column is dropped and re-created; if set to False (the default), an existing column is left untouched. Defaults to False
Returns

a dict describing the added column (table name, column name, type and default value) if the column was newly added, or None if the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1584    def drop_column(
1585        self, column: dict = None, table_name: str = None, column_name: str = None
1586    ) -> bool:
1587        """
1588        The `drop_column` function drops a specified column from a given table in a database and returns
1589        True if the column was successfully dropped, and False if the column does not exist in the
1590        table.
1591
1592        :param column: The `column` parameter is a dictionary that contains information about the column
1593        you want to drop. It has two keys:
1594        :type column: dict
1595        :param table_name: The `table_name` parameter is the name of the table from which you want to
1596        drop a column
1597        :type table_name: str
1598        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1599        from the table
1600        :type column_name: str
1601        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1602        and False if the column does not exist in the table.
1603        """
1604
1605        # Find column infos
1606        if column:
1607            if isinstance(column, dict):
1608                table_name = column.get("table_name", None)
1609                column_name = column.get("column_name", None)
1610            elif isinstance(column, str):
1611                table_name = self.get_table_variants()
1612                column_name = column
1613            else:
1614                table_name = None
1615                column_name = None
1616
1617        if not table_name and not column_name:
1618            return False
1619
1620        # Removed
1621        removed = False
1622
1623        # Check if the column already exists in the table
1624        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1625        columns = self.get_query_to_df(query).columns.tolist()
1626        if column_name in columns:
1627            log.debug(f"The {column_name} column exists in the {table_name} table")
1628        else:
1629            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1630            return False
1631
1632        # Add column in table # ALTER TABLE integers DROP k
1633        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1634        self.execute_query(add_column_query)
1635        removed = True
1636        log.debug(
1637            f"The {column_name} column was successfully dropped to the {table_name} table"
1638        )
1639
1640        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: table_name and column_name. It may also be given as a plain column name string, in which case the variants table is used
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function explodes the INFO fields of the variants table into
        individual columns, returning a list of added columns.

        Each selected INFO field becomes a dedicated column named `<prefix><field>`, filled by
        parsing the INFO string in SQL, chromosome by chromosome. Existing indexes are dropped
        before the updates and re-created only if `create_index` is set. No-op in read-only
        ("RO") access mode.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
        individual columns. If this parameter is not provided, all INFO fields will be exploded
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate the column if it already exists in the table. If `force` is set to `True`, the column
        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed in a single UPDATE statement. If set to `False`,
        each INFO field will be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns (dicts as returned
        by `add_column`).
        """

        # drop indexes before mass updates (re-created below only if create_index)
        self.drop_indexes()

        # connexion format (duckdb or sqlite — drives the SQL dialect below)
        connexion_format = self.get_connexion_format()

        # Access mode: no columns are added in read-only ("RO") mode
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: missing extra infos fall back to empty list)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" expands to all header fields)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Column type from header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract the '<info>=' value from the INFO string
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        # NOTE(review): update_info_field is unbound for any other
                        # connexion format — presumably validated upstream; confirm
                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (best-effort: fall back to one pass without WHERE clause)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded
  • force: The force parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to False, the column will not be dropped, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually, defaults to False
Returns

The function explode_infos returns a list of added columns.

def create_indexes(self) -> None:
1848    def create_indexes(self) -> None:
1849        """
1850        Create indexes on the table after insertion
1851        """
1852
1853        # Access
1854        access = self.get_config().get("access", None)
1855
1856        # get table variants
1857        table_variants = self.get_table_variants("FROM")
1858
1859        if self.get_indexing() and access not in ["RO"]:
1860            # Create index
1861            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1862            self.conn.execute(sql_create_table_index)
1863            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1864            self.conn.execute(sql_create_table_index)
1865            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1866            self.conn.execute(sql_create_table_index)
1867            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1868            self.conn.execute(sql_create_table_index)
1869            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1870            self.conn.execute(sql_create_table_index)
1871            for field in self.index_additionnal_fields:
1872                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1873                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1875    def drop_indexes(self) -> None:
1876        """
1877        Create indexes on the table after insertion
1878        """
1879
1880        # Access
1881        access = self.get_config().get("access", None)
1882
1883        # get table variants
1884        table_variants = self.get_table_variants("FROM")
1885
1886        # Get database format
1887        connexion_format = self.get_connexion_format()
1888
1889        if access not in ["RO"]:
1890            if connexion_format in ["duckdb"]:
1891                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1892            elif connexion_format in ["sqlite"]:
1893                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1894
1895            list_indexes = self.conn.execute(sql_list_indexes)
1896            index_names = [row[0] for row in list_indexes.fetchall()]
1897            for index in index_names:
1898                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1899                self.conn.execute(sql_drop_table_index)

Drop all indexes of the variants table (no-op in read-only access).

def read_vcf_header(self, f) -> list:
1901    def read_vcf_header(self, f) -> list:
1902        """
1903        It reads the header of a VCF file and returns a list of the header lines
1904
1905        :param f: the file object
1906        :return: The header lines of the VCF file.
1907        """
1908
1909        header_list = []
1910        for line in f:
1911            header_list.append(line)
1912            if line.startswith("#CHROM"):
1913                break
1914        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
1916    def read_vcf_header_file(self, file: str = None) -> list:
1917        """
1918        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1919        uncompressed files.
1920
1921        :param file: The `file` parameter is a string that represents the path to the VCF header file
1922        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1923        default to `None`
1924        :type file: str
1925        :return: The function `read_vcf_header_file` returns a list.
1926        """
1927
1928        if self.get_input_compressed(input_file=file):
1929            with bgzf.open(file, "rt") as f:
1930                return self.read_vcf_header(f=f)
1931        else:
1932            with open(file, "rt") as f:
1933                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
1935    def execute_query(self, query: str):
1936        """
1937        It takes a query as an argument, executes it, and returns the results
1938
1939        :param query: The query to be executed
1940        :return: The result of the query is being returned.
1941        """
1942        if query:
1943            return self.conn.execute(query)  # .fetchall()
1944        else:
1945            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
1947    def export_output(
1948        self,
1949        output_file: str | None = None,
1950        output_header: str | None = None,
1951        export_header: bool = True,
1952        query: str | None = None,
1953        parquet_partitions: list | None = None,
1954        chunk_size: int | None = None,
1955        threads: int | None = None,
1956        sort: bool = False,
1957        index: bool = False,
1958        order_by: str | None = None,
1959    ) -> bool:
1960        """
1961        The `export_output` function exports data from a VCF file to a specified output file in various
1962        formats, including VCF, CSV, TSV, PSV, and Parquet.
1963
1964        :param output_file: The `output_file` parameter is a string that specifies the name of the
1965        output file to be generated by the function. This is where the exported data will be saved
1966        :type output_file: str
1967        :param output_header: The `output_header` parameter is a string that specifies the name of the
1968        file where the header of the VCF file will be exported. If this parameter is not provided, the
1969        header will be exported to a file with the same name as the `output_file` parameter, but with
1970        the extension "
1971        :type output_header: str
1972        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1973        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1974        True, the header will be exported to a file. If `export_header` is False, the header will not
1975        be, defaults to True, if output format is not VCF
1976        :type export_header: bool (optional)
1977        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1978        select specific data from the VCF file before exporting it. If provided, only the data that
1979        matches the query will be exported
1980        :type query: str
1981        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1982        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1983        organize data in a hierarchical directory structure based on the values of one or more columns.
1984        This can improve query performance when working with large datasets
1985        :type parquet_partitions: list
1986        :param chunk_size: The `chunk_size` parameter specifies the number of
1987        records in batch when exporting data in Parquet format. This parameter is used for
1988        partitioning the Parquet file into multiple files.
1989        :type chunk_size: int
1990        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1991        threads to be used during the export process. It determines the level of parallelism and can
1992        improve the performance of the export operation. If not provided, the function will use the
1993        default number of threads
1994        :type threads: int
1995        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1996        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1997        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1998        False
1999        :type sort: bool (optional)
2000        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2001        created on the output file. If `index` is True, an index will be created. If `index` is False,
2002        no index will be created. The default value is False, defaults to False
2003        :type index: bool (optional)
2004        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2005        sorting the output file. This parameter is only applicable when exporting data in VCF format
2006        :type order_by: str
2007        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2008        None if it doesn't.
2009        """
2010
2011        # Log
2012        log.info("Exporting...")
2013
2014        # Full path
2015        output_file = full_path(output_file)
2016        output_header = full_path(output_header)
2017
2018        # Config
2019        config = self.get_config()
2020
2021        # Param
2022        param = self.get_param()
2023
2024        # Tmp files to remove
2025        tmp_to_remove = []
2026
2027        # If no output, get it
2028        if not output_file:
2029            output_file = self.get_output()
2030
2031        # If not threads
2032        if not threads:
2033            threads = self.get_threads()
2034
2035        # Auto header name with extension
2036        if export_header or output_header:
2037            if not output_header:
2038                output_header = f"{output_file}.hdr"
2039            # Export header
2040            self.export_header(output_file=output_file)
2041
2042        # Switch off export header if VCF output
2043        output_file_type = get_file_format(output_file)
2044        if output_file_type in ["vcf"]:
2045            export_header = False
2046            tmp_to_remove.append(output_header)
2047
2048        # Chunk size
2049        if not chunk_size:
2050            chunk_size = config.get("chunk_size", None)
2051
2052        # Parquet partition
2053        if not parquet_partitions:
2054            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2055        if parquet_partitions and isinstance(parquet_partitions, str):
2056            parquet_partitions = parquet_partitions.split(",")
2057
2058        # Order by
2059        if not order_by:
2060            order_by = param.get("export", {}).get("order_by", "")
2061
2062        # Header in output
2063        header_in_output = param.get("export", {}).get("include_header", False)
2064
2065        # Database
2066        database_source = self.get_connexion()
2067
2068        # Connexion format
2069        connexion_format = self.get_connexion_format()
2070
2071        # Explode infos
2072        if self.get_explode_infos():
2073            self.explode_infos(
2074                prefix=self.get_explode_infos_prefix(),
2075                fields=self.get_explode_infos_fields(),
2076                force=False,
2077            )
2078
2079        # if connexion_format in ["sqlite"] or query:
2080        if connexion_format in ["sqlite"]:
2081
2082            # Export in Parquet
2083            random_tmp = "".join(
2084                random.choice(string.ascii_lowercase) for i in range(10)
2085            )
2086            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2087            tmp_to_remove.append(database_source)
2088
2089            # Table Variants
2090            table_variants = self.get_table_variants()
2091
2092            # Create export query
2093            sql_query_export_subquery = f"""
2094                SELECT * FROM {table_variants}
2095                """
2096
2097            # Write source file
2098            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2099
2100        # Create database
2101        database = Database(
2102            database=database_source,
2103            table="variants",
2104            header_file=output_header,
2105            conn_config=self.get_connexion_config(),
2106        )
2107
2108        # Existing colomns header
2109        # existing_columns_header = database.get_header_file_columns(output_header)
2110        existing_columns_header = database.get_header_columns_from_database()
2111
2112        # Export file
2113        database.export(
2114            output_database=output_file,
2115            output_header=output_header,
2116            existing_columns_header=existing_columns_header,
2117            parquet_partitions=parquet_partitions,
2118            chunk_size=chunk_size,
2119            threads=threads,
2120            sort=sort,
2121            index=index,
2122            header_in_output=header_in_output,
2123            order_by=order_by,
2124            query=query,
2125            export_header=export_header,
2126        )
2127
2128        # Remove
2129        remove_if_exists(tmp_to_remove)
2130
2131        return (os.path.exists(output_file) or None) and (
2132            os.path.exists(output_file) or None
2133        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file; if False, it will not be exported. Defaults to True, and is only effective when the output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2135    def get_extra_infos(self, table: str = None) -> list:
2136        """
2137        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2138        in the header.
2139
2140        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2141        name of the table from which you want to retrieve the extra columns that are not present in the
2142        header. If the `table` parameter is not provided when calling the function, it will default to
2143        using the variants
2144        :type table: str
2145        :return: A list of columns that are in the specified table but not in the header of the table.
2146        """
2147
2148        header_columns = []
2149
2150        if not table:
2151            table = self.get_table_variants(clause="from")
2152            header_columns = self.get_header_columns()
2153
2154        # Check all columns in the database
2155        query = f""" SELECT * FROM {table} LIMIT 1 """
2156        log.debug(f"query {query}")
2157        table_columns = self.get_query_to_df(query).columns.tolist()
2158        extra_columns = []
2159
2160        # Construct extra infos (not in header)
2161        for column in table_columns:
2162            if column not in header_columns:
2163                extra_columns.append(column)
2164
2165        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2167    def get_extra_infos_sql(self, table: str = None) -> str:
2168        """
2169        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2170        by double quotes
2171
2172        :param table: The name of the table to get the extra infos from. If None, the default table is
2173        used
2174        :type table: str
2175        :return: A string of the extra infos
2176        """
2177
2178        return ", ".join(
2179            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2180        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2182    def export_header(
2183        self,
2184        header_name: str = None,
2185        output_file: str = None,
2186        output_file_ext: str = ".hdr",
2187        clean_header: bool = True,
2188        remove_chrom_line: bool = False,
2189    ) -> str:
2190        """
2191        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2192        specified options, and writes it to a new file.
2193
2194        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2195        this parameter is not specified, the header will be written to the output file
2196        :type header_name: str
2197        :param output_file: The `output_file` parameter in the `export_header` function is used to
2198        specify the name of the output file where the header will be written. If this parameter is not
2199        provided, the header will be written to a temporary file
2200        :type output_file: str
2201        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2202        string that represents the extension of the output header file. By default, it is set to ".hdr"
2203        if not specified by the user. This extension will be appended to the `output_file` name to
2204        create the final, defaults to .hdr
2205        :type output_file_ext: str (optional)
2206        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2207        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2208        `True`, the function will clean the header by modifying certain lines based on a specific
2209        pattern. If `clean_header`, defaults to True
2210        :type clean_header: bool (optional)
2211        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2212        boolean flag that determines whether the #CHROM line should be removed from the header before
2213        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2214        defaults to False
2215        :type remove_chrom_line: bool (optional)
2216        :return: The function `export_header` returns the name of the temporary header file that is
2217        created.
2218        """
2219
2220        if not header_name and not output_file:
2221            output_file = self.get_output()
2222
2223        if self.get_header():
2224
2225            # Get header object
2226            header_obj = self.get_header()
2227
2228            # Create database
2229            db_for_header = Database(database=self.get_input())
2230
2231            # Get real columns in the file
2232            db_header_columns = db_for_header.get_columns()
2233
2234            with tempfile.TemporaryDirectory() as tmpdir:
2235
2236                # Write header file
2237                header_file_tmp = os.path.join(tmpdir, "header")
2238                f = open(header_file_tmp, "w")
2239                vcf.Writer(f, header_obj)
2240                f.close()
2241
2242                # Replace #CHROM line with rel columns
2243                header_list = db_for_header.read_header_file(
2244                    header_file=header_file_tmp
2245                )
2246                header_list[-1] = "\t".join(db_header_columns)
2247
2248                # Remove CHROM line
2249                if remove_chrom_line:
2250                    header_list.pop()
2251
2252                # Clean header
2253                if clean_header:
2254                    header_list_clean = []
2255                    for head in header_list:
2256                        # Clean head for malformed header
2257                        head_clean = head
2258                        head_clean = re.subn(
2259                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2260                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2261                            head_clean,
2262                            2,
2263                        )[0]
2264                        # Write header
2265                        header_list_clean.append(head_clean)
2266                    header_list = header_list_clean
2267
2268            tmp_header_name = output_file + output_file_ext
2269
2270            f = open(tmp_header_name, "w")
2271            for line in header_list:
2272                f.write(line)
2273            f.close()
2274
2275        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2277    def export_variant_vcf(
2278        self,
2279        vcf_file,
2280        remove_info: bool = False,
2281        add_samples: bool = True,
2282        list_samples: list = [],
2283        where_clause: str = "",
2284        index: bool = False,
2285        threads: int | None = None,
2286    ) -> bool | None:
2287        """
2288        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2289        remove INFO field, add samples, and control compression and indexing.
2290
2291        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2292        written to. It is the output file that will contain the filtered VCF data based on the specified
2293        parameters
2294        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2295        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2296        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2297        in, defaults to False
2298        :type remove_info: bool (optional)
2299        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2300        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2301        If set to False, the samples will be removed. The default value is True, defaults to True
2302        :type add_samples: bool (optional)
2303        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2304        in the output VCF file. By default, all samples will be included. If you provide a list of
2305        samples, only those samples will be included in the output file
2306        :type list_samples: list
2307        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2308        determines whether or not to create an index for the output VCF file. If `index` is set to
2309        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2310        :type index: bool (optional)
2311        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2312        number of threads to use for exporting the VCF file. It determines how many parallel threads
2313        will be used during the export process. More threads can potentially speed up the export process
2314        by utilizing multiple cores of the processor. If
2315        :type threads: int | None
2316        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2317        method with various parameters including the output file, query, threads, sort flag, and index
2318        flag. The `export_output` method is responsible for exporting the VCF data based on the
2319        specified parameters and configurations provided in the `export_variant_vcf` function.
2320        """
2321
2322        # Config
2323        config = self.get_config()
2324
2325        # Extract VCF
2326        log.debug("Export VCF...")
2327
2328        # Table variants
2329        table_variants = self.get_table_variants()
2330
2331        # Threads
2332        if not threads:
2333            threads = self.get_threads()
2334
2335        # Info fields
2336        if remove_info:
2337            if not isinstance(remove_info, str):
2338                remove_info = "."
2339            info_field = f"""'{remove_info}' as INFO"""
2340        else:
2341            info_field = "INFO"
2342
2343        # Samples fields
2344        if add_samples:
2345            if not list_samples:
2346                list_samples = self.get_header_sample_list()
2347            if list_samples:
2348                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2349            else:
2350                samples_fields = ""
2351            log.debug(f"samples_fields: {samples_fields}")
2352        else:
2353            samples_fields = ""
2354
2355        # Where clause
2356        if where_clause is None:
2357            where_clause = ""
2358
2359        # Variants
2360        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2361        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2362        log.debug(f"sql_query_select={sql_query_select}")
2363
2364        return self.export_output(
2365            output_file=vcf_file,
2366            output_header=None,
2367            export_header=True,
2368            query=sql_query_select,
2369            parquet_partitions=None,
2370            chunk_size=config.get("chunk_size", None),
2371            threads=threads,
2372            sort=True,
2373            index=index,
2374            order_by=None,
2375        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2377    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2378        """
2379        It takes a list of commands and runs them in parallel using the number of threads specified
2380
2381        :param commands: A list of commands to run
2382        :param threads: The number of threads to use, defaults to 1 (optional)
2383        """
2384
2385        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2387    def get_threads(self, default: int = 1) -> int:
2388        """
2389        This function returns the number of threads to use for a job, with a default value of 1 if not
2390        specified.
2391
2392        :param default: The `default` parameter in the `get_threads` method is used to specify the
2393        default number of threads to use if no specific value is provided. If no value is provided for
2394        the `threads` parameter in the configuration or input parameters, the `default` value will be
2395        used, defaults to 1
2396        :type default: int (optional)
2397        :return: the number of threads to use for the current job.
2398        """
2399
2400        # Config
2401        config = self.get_config()
2402
2403        # Param
2404        param = self.get_param()
2405
2406        # Input threads
2407        input_thread = param.get("threads", config.get("threads", None))
2408
2409        # Check threads
2410        if not input_thread:
2411            threads = default
2412        elif int(input_thread) <= 0:
2413            threads = os.cpu_count()
2414        else:
2415            threads = int(input_thread)
2416        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2418    def get_memory(self, default: str = None) -> str:
2419        """
2420        This function retrieves the memory value from parameters or configuration with a default value
2421        if not found.
2422
2423        :param default: The `get_memory` function takes in a default value as a string parameter. This
2424        default value is used as a fallback in case the `memory` parameter is not provided in the
2425        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2426        the function
2427        :type default: str
2428        :return: The `get_memory` function returns a string value representing the memory parameter. If
2429        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2430        return the default value provided as an argument to the function.
2431        """
2432
2433        # Config
2434        config = self.get_config()
2435
2436        # Param
2437        param = self.get_param()
2438
2439        # Input threads
2440        input_memory = param.get("memory", config.get("memory", None))
2441
2442        # Check threads
2443        if input_memory:
2444            memory = input_memory
2445        else:
2446            memory = default
2447
2448        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary; if memory is not found in either dictionary, the function returns this default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2450    def update_from_vcf(self, vcf_file: str) -> None:
2451        """
2452        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2453
2454        :param vcf_file: the path to the VCF file
2455        """
2456
2457        connexion_format = self.get_connexion_format()
2458
2459        if connexion_format in ["duckdb"]:
2460            self.update_from_vcf_duckdb(vcf_file)
2461        elif connexion_format in ["sqlite"]:
2462            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file

        The VCF is read into a pandas DataFrame (`vcf_df`); DuckDB resolves
        that name directly inside the UPDATE query below (replacement scan),
        so the DataFrame's variable name must not be changed.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Loading VCF into a temporary DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO ('' and '.' are
        # treated as empty), separated by ';' when both sides are non-empty.
        # NOTE(review): for variants without a match the subquery is NULL;
        # DuckDB's concat() skips NULL arguments, so INFO should be preserved
        # — confirm this stays true for the targeted DuckDB version.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2520    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2521        """
2522        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2523        table, then updates the INFO column of the variants table with the INFO column of the temporary
2524        table
2525
2526        :param vcf_file: The path to the VCF file you want to update the database with
2527        """
2528
2529        # Create a temporary table for the VCF
2530        table_vcf = "tmp_vcf"
2531        sql_create = (
2532            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2533        )
2534        self.conn.execute(sql_create)
2535
2536        # Loading VCF into temporaire table
2537        vcf_df = pd.read_csv(
2538            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2539        )
2540        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2541        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2542
2543        # Update table 'variants' with VCF data
2544        # warning: CONCAT as || operator
2545        sql_query_update = f"""
2546            UPDATE variants as table_variants
2547            SET INFO = CASE
2548                            WHEN INFO NOT IN ('', '.')
2549                            THEN INFO
2550                            ELSE ''
2551                        END ||
2552                        (
2553                        SELECT 
2554                            CASE 
2555                                WHEN table_variants.INFO NOT IN ('','.') 
2556                                    AND table_vcf.INFO NOT IN ('','.')  
2557                                THEN ';' 
2558                                ELSE '' 
2559                            END || 
2560                            CASE 
2561                                WHEN table_vcf.INFO NOT IN ('','.') 
2562                                THEN table_vcf.INFO 
2563                                ELSE '' 
2564                            END
2565                        FROM {table_vcf} as table_vcf
2566                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2567                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2568                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2569                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2570                        )
2571        """
2572        self.conn.execute(sql_query_update)
2573
2574        # Drop temporary table
2575        sql_drop = f"DROP TABLE {table_vcf}"
2576        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2578    def drop_variants_table(self) -> None:
2579        """
2580        > This function drops the variants table
2581        """
2582
2583        table_variants = self.get_table_variants()
2584        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2585        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        assembly, `#CHROM`, `POS`, `REF`, and `ALT` columns (plus a SVTYPE component, see note below)

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a table column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column: fall back to the default name when empty
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column if missing, or when forced
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is single-quoted in the SQL, so
            # the hash input is the literal column-name string, not each row's
            # SVTYPE value — confirm whether this is intentional.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2646    def get_variant_id_column(
2647        self, variant_id_column: str = "variant_id", force: bool = None
2648    ) -> str:
2649        """
2650        This function returns the variant_id column name
2651
2652        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2653        defaults to variant_id
2654        :type variant_id_column: str (optional)
2655        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2656        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2657        if it is not already set, or if it is set
2658        :type force: bool
2659        :return: The variant_id column name.
2660        """
2661
2662        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2668    def scan_databases(
2669        self,
2670        database_formats: list = ["parquet"],
2671        database_releases: list = ["current"],
2672    ) -> dict:
2673        """
2674        The function `scan_databases` scans for available databases based on specified formats and
2675        releases.
2676
2677        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2678        of the databases to be scanned. In this case, the accepted format is "parquet"
2679        :type database_formats: list ["parquet"]
2680        :param database_releases: The `database_releases` parameter is a list that specifies the
2681        releases of the databases to be scanned. In the provided function, the default value for
2682        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2683        databases that are in the "current"
2684        :type database_releases: list
2685        :return: The function `scan_databases` returns a dictionary containing information about
2686        databases that match the specified formats and releases.
2687        """
2688
2689        # Config
2690        config = self.get_config()
2691
2692        # Param
2693        param = self.get_param()
2694
2695        # Param - Assembly
2696        assembly = param.get("assembly", config.get("assembly", None))
2697        if not assembly:
2698            assembly = DEFAULT_ASSEMBLY
2699            log.warning(f"Default assembly '{assembly}'")
2700
2701        # Scan for availabled databases
2702        log.info(
2703            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2704        )
2705        databases_infos_dict = databases_infos(
2706            database_folder_releases=database_releases,
2707            database_formats=database_formats,
2708            assembly=assembly,
2709            config=config,
2710        )
2711        log.info(
2712            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2713        )
2714
2715        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. Its default value is ["current"], meaning that by default the function will scan databases that are in the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        The "annotations" and "annotation_*" entries of the parameters are
        merged into one comma-separated list; each entry is resolved to a tool
        (parquet, bcftools, snpsift, snpeff, annovar, exomiser, splice) and a
        database file, the per-tool configuration is written back into
        param["annotation"], and the per-tool annotation methods are called.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param, then config, then default with a warning)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (annotations + parquet + bcftools)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Per-tool quick-annotation parameters, appended with a "tool:" prefix
        # so they are dispatched in the loop below
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # ',' separators are replaced by '+' so files stay one list entry
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters (string form expanded to a dict)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL (scan for every available database)
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options: each ':'-separated token is one annovar code
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: strip an explicit "tool:" prefix
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    # tabix index presence decides bcftools eligibility
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in this list
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3089    def annotation_snpsift(self, threads: int = None) -> None:
3090        """
3091        This function annotate with bcftools
3092
3093        :param threads: Number of threads to use
3094        :return: the value of the variable "return_value".
3095        """
3096
3097        # DEBUG
3098        log.debug("Start annotation with bcftools databases")
3099
3100        # Threads
3101        if not threads:
3102            threads = self.get_threads()
3103        log.debug("Threads: " + str(threads))
3104
3105        # Config
3106        config = self.get_config()
3107        log.debug("Config: " + str(config))
3108
3109        # Config - snpSift
3110        snpsift_bin_command = get_bin_command(
3111            bin="SnpSift.jar",
3112            tool="snpsift",
3113            bin_type="jar",
3114            config=config,
3115            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3116        )
3117        if not snpsift_bin_command:
3118            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3119            log.error(msg_err)
3120            raise ValueError(msg_err)
3121
3122        # Config - bcftools
3123        bcftools_bin_command = get_bin_command(
3124            bin="bcftools",
3125            tool="bcftools",
3126            bin_type="bin",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3129        )
3130        if not bcftools_bin_command:
3131            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - BCFTools databases folders
3136        databases_folders = set(
3137            self.get_config()
3138            .get("folders", {})
3139            .get("databases", {})
3140            .get("annotations", ["."])
3141            + self.get_config()
3142            .get("folders", {})
3143            .get("databases", {})
3144            .get("bcftools", ["."])
3145        )
3146        log.debug("Databases annotations: " + str(databases_folders))
3147
3148        # Param
3149        annotations = (
3150            self.get_param()
3151            .get("annotation", {})
3152            .get("snpsift", {})
3153            .get("annotations", None)
3154        )
3155        log.debug("Annotations: " + str(annotations))
3156
3157        # Assembly
3158        assembly = self.get_param().get(
3159            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3160        )
3161
3162        # Data
3163        table_variants = self.get_table_variants()
3164
3165        # Check if not empty
3166        log.debug("Check if not empty")
3167        sql_query_chromosomes = (
3168            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3169        )
3170        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3171        if not sql_query_chromosomes_df["count"][0]:
3172            log.info(f"VCF empty")
3173            return
3174
3175        # VCF header
3176        vcf_reader = self.get_header()
3177        log.debug("Initial header: " + str(vcf_reader.infos))
3178
3179        # Existing annotations
3180        for vcf_annotation in self.get_header().infos:
3181
3182            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3183            log.debug(
3184                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3185            )
3186
3187        if annotations:
3188
3189            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3190
3191                # Export VCF file
3192                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3193
3194                # Init
3195                commands = {}
3196
3197                for annotation in annotations:
3198                    annotation_fields = annotations[annotation]
3199
3200                    # Annotation Name
3201                    annotation_name = os.path.basename(annotation)
3202
3203                    if not annotation_fields:
3204                        annotation_fields = {"INFO": None}
3205
3206                    log.debug(f"Annotation '{annotation_name}'")
3207                    log.debug(
3208                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3209                    )
3210
3211                    # Create Database
3212                    database = Database(
3213                        database=annotation,
3214                        databases_folders=databases_folders,
3215                        assembly=assembly,
3216                    )
3217
3218                    # Find files
3219                    db_file = database.get_database()
3220                    db_file = full_path(db_file)
3221                    db_hdr_file = database.get_header_file()
3222                    db_hdr_file = full_path(db_hdr_file)
3223                    db_file_type = database.get_format()
3224                    db_tbi_file = f"{db_file}.tbi"
3225                    db_file_compressed = database.is_compressed()
3226
3227                    # Check if compressed
3228                    if not db_file_compressed:
3229                        log.error(
3230                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3231                        )
3232                        raise ValueError(
3233                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3234                        )
3235
3236                    # Check if indexed
3237                    if not os.path.exists(db_tbi_file):
3238                        log.error(
3239                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3240                        )
3241                        raise ValueError(
3242                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3243                        )
3244
3245                    # Check index - try to create if not exists
3246                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3247                        log.error("Annotation failed: database not valid")
3248                        log.error(f"Annotation annotation file: {db_file}")
3249                        log.error(f"Annotation annotation header: {db_hdr_file}")
3250                        log.error(f"Annotation annotation index: {db_tbi_file}")
3251                        raise ValueError(
3252                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3253                        )
3254                    else:
3255
3256                        log.debug(
3257                            f"Annotation '{annotation}' - file: "
3258                            + str(db_file)
3259                            + " and "
3260                            + str(db_hdr_file)
3261                        )
3262
3263                        # Load header as VCF object
3264                        db_hdr_vcf = Variants(input=db_hdr_file)
3265                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3266                        log.debug(
3267                            "Annotation database header: "
3268                            + str(db_hdr_vcf_header_infos)
3269                        )
3270
3271                        # For all fields in database
3272                        annotation_fields_full = False
3273                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3274                            annotation_fields = {
3275                                key: key for key in db_hdr_vcf_header_infos
3276                            }
3277                            log.debug(
3278                                "Annotation database header - All annotations added: "
3279                                + str(annotation_fields)
3280                            )
3281                            annotation_fields_full = True
3282
3283                        # # Create file for field rename
3284                        # log.debug("Create file for field rename")
3285                        # tmp_rename = NamedTemporaryFile(
3286                        #     prefix=self.get_prefix(),
3287                        #     dir=self.get_tmp_dir(),
3288                        #     suffix=".rename",
3289                        #     delete=False,
3290                        # )
3291                        # tmp_rename_name = tmp_rename.name
3292                        # tmp_files.append(tmp_rename_name)
3293
3294                        # Number of fields
3295                        nb_annotation_field = 0
3296                        annotation_list = []
3297                        annotation_infos_rename_list = []
3298
3299                        for annotation_field in annotation_fields:
3300
3301                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3302                            annotation_fields_new_name = annotation_fields.get(
3303                                annotation_field, annotation_field
3304                            )
3305                            if not annotation_fields_new_name:
3306                                annotation_fields_new_name = annotation_field
3307
3308                            # Check if field is in DB and if field is not elready in input data
3309                            if (
3310                                annotation_field in db_hdr_vcf.get_header().infos
3311                                and annotation_fields_new_name
3312                                not in self.get_header().infos
3313                            ):
3314
3315                                log.info(
3316                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3317                                )
3318
3319                                # BCFTools annotate param to rename fields
3320                                if annotation_field != annotation_fields_new_name:
3321                                    annotation_infos_rename_list.append(
3322                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3323                                    )
3324
3325                                # Add INFO field to header
3326                                db_hdr_vcf_header_infos_number = (
3327                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3328                                )
3329                                db_hdr_vcf_header_infos_type = (
3330                                    db_hdr_vcf_header_infos[annotation_field].type
3331                                    or "String"
3332                                )
3333                                db_hdr_vcf_header_infos_description = (
3334                                    db_hdr_vcf_header_infos[annotation_field].desc
3335                                    or f"{annotation_field} description"
3336                                )
3337                                db_hdr_vcf_header_infos_source = (
3338                                    db_hdr_vcf_header_infos[annotation_field].source
3339                                    or "unknown"
3340                                )
3341                                db_hdr_vcf_header_infos_version = (
3342                                    db_hdr_vcf_header_infos[annotation_field].version
3343                                    or "unknown"
3344                                )
3345
3346                                vcf_reader.infos[annotation_fields_new_name] = (
3347                                    vcf.parser._Info(
3348                                        annotation_fields_new_name,
3349                                        db_hdr_vcf_header_infos_number,
3350                                        db_hdr_vcf_header_infos_type,
3351                                        db_hdr_vcf_header_infos_description,
3352                                        db_hdr_vcf_header_infos_source,
3353                                        db_hdr_vcf_header_infos_version,
3354                                        self.code_type_map[
3355                                            db_hdr_vcf_header_infos_type
3356                                        ],
3357                                    )
3358                                )
3359
3360                                annotation_list.append(annotation_field)
3361
3362                                nb_annotation_field += 1
3363
3364                            else:
3365
3366                                if (
3367                                    annotation_field
3368                                    not in db_hdr_vcf.get_header().infos
3369                                ):
3370                                    log.warning(
3371                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3372                                    )
3373                                if (
3374                                    annotation_fields_new_name
3375                                    in self.get_header().infos
3376                                ):
3377                                    log.warning(
3378                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3379                                    )
3380
3381                        log.info(
3382                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3383                        )
3384
3385                        annotation_infos = ",".join(annotation_list)
3386
3387                        if annotation_infos != "":
3388
3389                            # Annotated VCF (and error file)
3390                            tmp_annotation_vcf_name = os.path.join(
3391                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3392                            )
3393                            tmp_annotation_vcf_name_err = (
3394                                tmp_annotation_vcf_name + ".err"
3395                            )
3396
3397                            # Add fields to annotate
3398                            if not annotation_fields_full:
3399                                annotation_infos_option = f"-info {annotation_infos}"
3400                            else:
3401                                annotation_infos_option = ""
3402
3403                            # Info fields rename
3404                            if annotation_infos_rename_list:
3405                                annotation_infos_rename = " -c " + ",".join(
3406                                    annotation_infos_rename_list
3407                                )
3408                            else:
3409                                annotation_infos_rename = ""
3410
3411                            # Annotate command
3412                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3413
3414                            # Add command
3415                            commands[command_annotate] = tmp_annotation_vcf_name
3416
3417                if commands:
3418
3419                    # Export VCF file
3420                    self.export_variant_vcf(
3421                        vcf_file=tmp_vcf_name,
3422                        remove_info=True,
3423                        add_samples=False,
3424                        index=True,
3425                    )
3426                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3427
3428                    # Num command
3429                    nb_command = 0
3430
3431                    # Annotate
3432                    for command_annotate in commands:
3433                        nb_command += 1
3434                        log.info(
3435                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3436                        )
3437                        log.debug(f"command_annotate={command_annotate}")
3438                        run_parallel_commands([command_annotate], threads)
3439
3440                        # Debug
3441                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3442
3443                        # Update variants
3444                        log.info(
3445                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3446                        )
3447                        self.update_from_vcf(commands[command_annotate])

This function annotates variants with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None (annotations are merged into the variants table in place).

def annotation_bcftools(self, threads: int = None) -> None:
3449    def annotation_bcftools(self, threads: int = None) -> None:
3450        """
3451        This function annotate with bcftools
3452
3453        :param threads: Number of threads to use
3454        :return: the value of the variable "return_value".
3455        """
3456
3457        # DEBUG
3458        log.debug("Start annotation with bcftools databases")
3459
3460        # Threads
3461        if not threads:
3462            threads = self.get_threads()
3463        log.debug("Threads: " + str(threads))
3464
3465        # Config
3466        config = self.get_config()
3467        log.debug("Config: " + str(config))
3468
3469        # DEBUG
3470        delete_tmp = True
3471        if self.get_config().get("verbosity", "warning") in ["debug"]:
3472            delete_tmp = False
3473            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3474
3475        # Config - BCFTools bin command
3476        bcftools_bin_command = get_bin_command(
3477            bin="bcftools",
3478            tool="bcftools",
3479            bin_type="bin",
3480            config=config,
3481            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3482        )
3483        if not bcftools_bin_command:
3484            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3485            log.error(msg_err)
3486            raise ValueError(msg_err)
3487
3488        # Config - BCFTools databases folders
3489        databases_folders = set(
3490            self.get_config()
3491            .get("folders", {})
3492            .get("databases", {})
3493            .get("annotations", ["."])
3494            + self.get_config()
3495            .get("folders", {})
3496            .get("databases", {})
3497            .get("bcftools", ["."])
3498        )
3499        log.debug("Databases annotations: " + str(databases_folders))
3500
3501        # Param
3502        annotations = (
3503            self.get_param()
3504            .get("annotation", {})
3505            .get("bcftools", {})
3506            .get("annotations", None)
3507        )
3508        log.debug("Annotations: " + str(annotations))
3509
3510        # Assembly
3511        assembly = self.get_param().get(
3512            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3513        )
3514
3515        # Data
3516        table_variants = self.get_table_variants()
3517
3518        # Check if not empty
3519        log.debug("Check if not empty")
3520        sql_query_chromosomes = (
3521            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3522        )
3523        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3524        if not sql_query_chromosomes_df["count"][0]:
3525            log.info(f"VCF empty")
3526            return
3527
3528        # Export in VCF
3529        log.debug("Create initial file to annotate")
3530        tmp_vcf = NamedTemporaryFile(
3531            prefix=self.get_prefix(),
3532            dir=self.get_tmp_dir(),
3533            suffix=".vcf.gz",
3534            delete=False,
3535        )
3536        tmp_vcf_name = tmp_vcf.name
3537
3538        # VCF header
3539        vcf_reader = self.get_header()
3540        log.debug("Initial header: " + str(vcf_reader.infos))
3541
3542        # Existing annotations
3543        for vcf_annotation in self.get_header().infos:
3544
3545            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3546            log.debug(
3547                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3548            )
3549
3550        if annotations:
3551
3552            tmp_ann_vcf_list = []
3553            commands = []
3554            tmp_files = []
3555            err_files = []
3556
3557            for annotation in annotations:
3558                annotation_fields = annotations[annotation]
3559
3560                # Annotation Name
3561                annotation_name = os.path.basename(annotation)
3562
3563                if not annotation_fields:
3564                    annotation_fields = {"INFO": None}
3565
3566                log.debug(f"Annotation '{annotation_name}'")
3567                log.debug(
3568                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3569                )
3570
3571                # Create Database
3572                database = Database(
3573                    database=annotation,
3574                    databases_folders=databases_folders,
3575                    assembly=assembly,
3576                )
3577
3578                # Find files
3579                db_file = database.get_database()
3580                db_file = full_path(db_file)
3581                db_hdr_file = database.get_header_file()
3582                db_hdr_file = full_path(db_hdr_file)
3583                db_file_type = database.get_format()
3584                db_tbi_file = f"{db_file}.tbi"
3585                db_file_compressed = database.is_compressed()
3586
3587                # Check if compressed
3588                if not db_file_compressed:
3589                    log.error(
3590                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3591                    )
3592                    raise ValueError(
3593                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3594                    )
3595
3596                # Check if indexed
3597                if not os.path.exists(db_tbi_file):
3598                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3599                    raise ValueError(
3600                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3601                    )
3602
3603                # Check index - try to create if not exists
3604                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3605                    log.error("Annotation failed: database not valid")
3606                    log.error(f"Annotation annotation file: {db_file}")
3607                    log.error(f"Annotation annotation header: {db_hdr_file}")
3608                    log.error(f"Annotation annotation index: {db_tbi_file}")
3609                    raise ValueError(
3610                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3611                    )
3612                else:
3613
3614                    log.debug(
3615                        f"Annotation '{annotation}' - file: "
3616                        + str(db_file)
3617                        + " and "
3618                        + str(db_hdr_file)
3619                    )
3620
3621                    # Load header as VCF object
3622                    db_hdr_vcf = Variants(input=db_hdr_file)
3623                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3624                    log.debug(
3625                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3626                    )
3627
3628                    # For all fields in database
3629                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3630                        annotation_fields = {
3631                            key: key for key in db_hdr_vcf_header_infos
3632                        }
3633                        log.debug(
3634                            "Annotation database header - All annotations added: "
3635                            + str(annotation_fields)
3636                        )
3637
3638                    # Number of fields
3639                    nb_annotation_field = 0
3640                    annotation_list = []
3641
3642                    for annotation_field in annotation_fields:
3643
3644                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3645                        annotation_fields_new_name = annotation_fields.get(
3646                            annotation_field, annotation_field
3647                        )
3648                        if not annotation_fields_new_name:
3649                            annotation_fields_new_name = annotation_field
3650
3651                        # Check if field is in DB and if field is not elready in input data
3652                        if (
3653                            annotation_field in db_hdr_vcf.get_header().infos
3654                            and annotation_fields_new_name
3655                            not in self.get_header().infos
3656                        ):
3657
3658                            log.info(
3659                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3660                            )
3661
3662                            # Add INFO field to header
3663                            db_hdr_vcf_header_infos_number = (
3664                                db_hdr_vcf_header_infos[annotation_field].num or "."
3665                            )
3666                            db_hdr_vcf_header_infos_type = (
3667                                db_hdr_vcf_header_infos[annotation_field].type
3668                                or "String"
3669                            )
3670                            db_hdr_vcf_header_infos_description = (
3671                                db_hdr_vcf_header_infos[annotation_field].desc
3672                                or f"{annotation_field} description"
3673                            )
3674                            db_hdr_vcf_header_infos_source = (
3675                                db_hdr_vcf_header_infos[annotation_field].source
3676                                or "unknown"
3677                            )
3678                            db_hdr_vcf_header_infos_version = (
3679                                db_hdr_vcf_header_infos[annotation_field].version
3680                                or "unknown"
3681                            )
3682
3683                            vcf_reader.infos[annotation_fields_new_name] = (
3684                                vcf.parser._Info(
3685                                    annotation_fields_new_name,
3686                                    db_hdr_vcf_header_infos_number,
3687                                    db_hdr_vcf_header_infos_type,
3688                                    db_hdr_vcf_header_infos_description,
3689                                    db_hdr_vcf_header_infos_source,
3690                                    db_hdr_vcf_header_infos_version,
3691                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3692                                )
3693                            )
3694
3695                            # annotation_list.append(annotation_field)
3696                            if annotation_field != annotation_fields_new_name:
3697                                annotation_list.append(
3698                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3699                                )
3700                            else:
3701                                annotation_list.append(annotation_field)
3702
3703                            nb_annotation_field += 1
3704
3705                        else:
3706
3707                            if annotation_field not in db_hdr_vcf.get_header().infos:
3708                                log.warning(
3709                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3710                                )
3711                            if annotation_fields_new_name in self.get_header().infos:
3712                                log.warning(
3713                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3714                                )
3715
3716                    log.info(
3717                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3718                    )
3719
3720                    annotation_infos = ",".join(annotation_list)
3721
3722                    if annotation_infos != "":
3723
3724                        # Protect header for bcftools (remove "#CHROM" and variants line)
3725                        log.debug("Protect Header file - remove #CHROM line if exists")
3726                        tmp_header_vcf = NamedTemporaryFile(
3727                            prefix=self.get_prefix(),
3728                            dir=self.get_tmp_dir(),
3729                            suffix=".hdr",
3730                            delete=False,
3731                        )
3732                        tmp_header_vcf_name = tmp_header_vcf.name
3733                        tmp_files.append(tmp_header_vcf_name)
3734                        # Command
3735                        if db_hdr_file.endswith(".gz"):
3736                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3737                        else:
3738                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3739                        # Run
3740                        run_parallel_commands([command_extract_header], 1)
3741
3742                        # Find chomosomes
3743                        log.debug("Find chromosomes ")
3744                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3745                        sql_query_chromosomes_df = self.get_query_to_df(
3746                            sql_query_chromosomes
3747                        )
3748                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3749
3750                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3751
3752                        # BED columns in the annotation file
3753                        if db_file_type in ["bed"]:
3754                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3755
3756                        for chrom in chomosomes_list:
3757
3758                            # Create BED on initial VCF
3759                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3760                            tmp_bed = NamedTemporaryFile(
3761                                prefix=self.get_prefix(),
3762                                dir=self.get_tmp_dir(),
3763                                suffix=".bed",
3764                                delete=False,
3765                            )
3766                            tmp_bed_name = tmp_bed.name
3767                            tmp_files.append(tmp_bed_name)
3768
3769                            # Detecte regions
3770                            log.debug(
3771                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3772                            )
3773                            window = 1000000
3774                            sql_query_intervals_for_bed = f"""
3775                                SELECT  \"#CHROM\",
3776                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3777                                        \"POS\"+{window}
3778                                FROM {table_variants} as table_variants
3779                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3780                            """
3781                            regions = self.conn.execute(
3782                                sql_query_intervals_for_bed
3783                            ).fetchall()
3784                            merged_regions = merge_regions(regions)
3785                            log.debug(
3786                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3787                            )
3788
3789                            header = ["#CHROM", "START", "END"]
3790                            with open(tmp_bed_name, "w") as f:
3791                                # Write the header with tab delimiter
3792                                f.write("\t".join(header) + "\n")
3793                                for d in merged_regions:
3794                                    # Write each data row with tab delimiter
3795                                    f.write("\t".join(map(str, d)) + "\n")
3796
3797                            # Tmp files
3798                            tmp_annotation_vcf = NamedTemporaryFile(
3799                                prefix=self.get_prefix(),
3800                                dir=self.get_tmp_dir(),
3801                                suffix=".vcf.gz",
3802                                delete=False,
3803                            )
3804                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3805                            tmp_files.append(tmp_annotation_vcf_name)
3806                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3807                            tmp_annotation_vcf_name_err = (
3808                                tmp_annotation_vcf_name + ".err"
3809                            )
3810                            err_files.append(tmp_annotation_vcf_name_err)
3811
3812                            # Annotate Command
3813                            log.debug(
3814                                f"Annotation '{annotation}' - add bcftools command"
3815                            )
3816
3817                            # Command
3818                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3819
3820                            # Add command
3821                            commands.append(command_annotate)
3822
3823            # if some commands
3824            if commands:
3825
3826                # Export VCF file
3827                self.export_variant_vcf(
3828                    vcf_file=tmp_vcf_name,
3829                    remove_info=True,
3830                    add_samples=False,
3831                    index=True,
3832                )
3833
3834                # Threads
3835                # calculate threads for annotated commands
3836                if commands:
3837                    threads_bcftools_annotate = round(threads / len(commands))
3838                else:
3839                    threads_bcftools_annotate = 1
3840
3841                if not threads_bcftools_annotate:
3842                    threads_bcftools_annotate = 1
3843
3844                # Add threads option to bcftools commands
3845                if threads_bcftools_annotate > 1:
3846                    commands_threaded = []
3847                    for command in commands:
3848                        commands_threaded.append(
3849                            command.replace(
3850                                f"{bcftools_bin_command} annotate ",
3851                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3852                            )
3853                        )
3854                    commands = commands_threaded
3855
3856                # Command annotation multithreading
3857                log.debug(f"Annotation - Annotation commands: " + str(commands))
3858                log.info(
3859                    f"Annotation - Annotation multithreaded in "
3860                    + str(len(commands))
3861                    + " commands"
3862                )
3863
3864                run_parallel_commands(commands, threads)
3865
3866                # Merge
3867                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3868
3869                if tmp_ann_vcf_list_cmd:
3870
3871                    # Tmp file
3872                    tmp_annotate_vcf = NamedTemporaryFile(
3873                        prefix=self.get_prefix(),
3874                        dir=self.get_tmp_dir(),
3875                        suffix=".vcf.gz",
3876                        delete=True,
3877                    )
3878                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3879                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3880                    err_files.append(tmp_annotate_vcf_name_err)
3881
3882                    # Tmp file remove command
3883                    tmp_files_remove_command = ""
3884                    if tmp_files:
3885                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3886
3887                    # Command merge
3888                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3889                    log.info(
3890                        f"Annotation - Annotation merging "
3891                        + str(len(commands))
3892                        + " annotated files"
3893                    )
3894                    log.debug(f"Annotation - merge command: {merge_command}")
3895                    run_parallel_commands([merge_command], 1)
3896
3897                    # Error messages
3898                    log.info(f"Error/Warning messages:")
3899                    error_message_command_all = []
3900                    error_message_command_warning = []
3901                    error_message_command_err = []
3902                    for err_file in err_files:
3903                        with open(err_file, "r") as f:
3904                            for line in f:
3905                                message = line.strip()
3906                                error_message_command_all.append(message)
3907                                if line.startswith("[W::"):
3908                                    error_message_command_warning.append(message)
3909                                if line.startswith("[E::"):
3910                                    error_message_command_err.append(
3911                                        f"{err_file}: " + message
3912                                    )
3913                    # log info
3914                    for message in list(
3915                        set(error_message_command_err + error_message_command_warning)
3916                    ):
3917                        log.info(f"   {message}")
3918                    # debug info
3919                    for message in list(set(error_message_command_all)):
3920                        log.debug(f"   {message}")
3921                    # failed
3922                    if len(error_message_command_err):
3923                        log.error("Annotation failed: Error in commands")
3924                        raise ValueError("Annotation failed: Error in commands")
3925
3926                    # Update variants
3927                    log.info(f"Annotation - Updating...")
3928                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates variants with bcftools.

Parameters
  • threads: Number of threads to use

Returns

The value of the variable "return_value".

3930    def annotation_exomiser(self, threads: int = None) -> None:
3931        """
3932        This function annotate with Exomiser
3933
3934        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3935        - "analysis" (dict/file):
3936            Full analysis dictionnary parameters (see Exomiser docs).
3937            Either a dict, or a file in JSON or YAML format.
3938            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3939            Default : None
3940        - "preset" (string):
3941            Analysis preset (available in config folder).
3942            Used if no full "analysis" is provided.
3943            Default: "exome"
3944        - "phenopacket" (dict/file):
3945            Samples and phenotipic features parameters (see Exomiser docs).
3946            Either a dict, or a file in JSON or YAML format.
3947            Default: None
3948        - "subject" (dict):
3949            Sample parameters (see Exomiser docs).
3950            Example:
3951                "subject":
3952                    {
3953                        "id": "ISDBM322017",
3954                        "sex": "FEMALE"
3955                    }
3956            Default: None
3957        - "sample" (string):
3958            Sample name to construct "subject" section:
3959                "subject":
3960                    {
3961                        "id": "<sample>",
3962                        "sex": "UNKNOWN_SEX"
3963                    }
3964            Default: None
3965        - "phenotypicFeatures" (dict)
3966            Phenotypic features to construct "subject" section.
3967            Example:
3968                "phenotypicFeatures":
3969                    [
3970                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3971                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3972                    ]
3973        - "hpo" (list)
3974            List of HPO ids as phenotypic features.
3975            Example:
3976                "hpo": ['0001156', '0001363', '0011304', '0010055']
3977            Default: []
3978        - "outputOptions" (dict):
3979            Output options (see Exomiser docs).
3980            Default:
3981                "output_options" =
3982                    {
3983                        "outputContributingVariantsOnly": False,
3984                        "numGenes": 0,
3985                        "outputFormats": ["TSV_VARIANT", "VCF"]
3986                    }
3987        - "transcript_source" (string):
3988            Transcript source (either "refseq", "ucsc", "ensembl")
3989            Default: "refseq"
3990        - "exomiser_to_info" (boolean):
3991            Add exomiser TSV file columns as INFO fields in VCF.
3992            Default: False
3993        - "release" (string):
3994            Exomise database release.
3995            If not exists, database release will be downloaded (take a while).
3996            Default: None (provided by application.properties configuration file)
3997        - "exomiser_application_properties" (file):
3998            Exomiser configuration file (see Exomiser docs).
3999            Useful to automatically download databases (especially for specific genome databases).
4000
4001        Notes:
4002        - If no sample in parameters, first sample in VCF will be chosen
4003        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4004
4005        :param threads: The number of threads to use
4006        :return: None.
4007        """
4008
4009        # DEBUG
4010        log.debug("Start annotation with Exomiser databases")
4011
4012        # Threads
4013        if not threads:
4014            threads = self.get_threads()
4015        log.debug("Threads: " + str(threads))
4016
4017        # Config
4018        config = self.get_config()
4019        log.debug("Config: " + str(config))
4020
4021        # Config - Folders - Databases
4022        databases_folders = (
4023            config.get("folders", {})
4024            .get("databases", {})
4025            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4026        )
4027        databases_folders = full_path(databases_folders)
4028        if not os.path.exists(databases_folders):
4029            log.error(f"Databases annotations: {databases_folders} NOT found")
4030        log.debug("Databases annotations: " + str(databases_folders))
4031
4032        # Config - Exomiser
4033        exomiser_bin_command = get_bin_command(
4034            bin="exomiser-cli*.jar",
4035            tool="exomiser",
4036            bin_type="jar",
4037            config=config,
4038            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4039        )
4040        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4041        if not exomiser_bin_command:
4042            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4043            log.error(msg_err)
4044            raise ValueError(msg_err)
4045
4046        # Param
4047        param = self.get_param()
4048        log.debug("Param: " + str(param))
4049
4050        # Param - Exomiser
4051        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4052        log.debug(f"Param Exomiser: {param_exomiser}")
4053
4054        # Param - Assembly
4055        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4056        log.debug("Assembly: " + str(assembly))
4057
4058        # Data
4059        table_variants = self.get_table_variants()
4060
4061        # Check if not empty
4062        log.debug("Check if not empty")
4063        sql_query_chromosomes = (
4064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4065        )
4066        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4067            log.info(f"VCF empty")
4068            return False
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Samples
4075        samples = self.get_header_sample_list()
4076        if not samples:
4077            log.error("No Samples in VCF")
4078            return False
4079        log.debug(f"Samples: {samples}")
4080
4081        # Memory limit
4082        memory_limit = self.get_memory("8G")
4083        log.debug(f"memory_limit: {memory_limit}")
4084
4085        # Exomiser java options
4086        exomiser_java_options = (
4087            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4088        )
4089        log.debug(f"Exomiser java options: {exomiser_java_options}")
4090
4091        # Download Exomiser (if not exists)
4092        exomiser_release = param_exomiser.get("release", None)
4093        exomiser_application_properties = param_exomiser.get(
4094            "exomiser_application_properties", None
4095        )
4096        databases_download_exomiser(
4097            assemblies=[assembly],
4098            exomiser_folder=databases_folders,
4099            exomiser_release=exomiser_release,
4100            exomiser_phenotype_release=exomiser_release,
4101            exomiser_application_properties=exomiser_application_properties,
4102        )
4103
4104        # Force annotation
4105        force_update_annotation = True
4106
4107        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4108            log.debug("Start annotation Exomiser")
4109
4110            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4111
4112                # tmp_dir = "/tmp/exomiser"
4113
4114                ### ANALYSIS ###
4115                ################
4116
4117                # Create analysis.json through analysis dict
4118                # either analysis in param or by default
4119                # depending on preset exome/genome)
4120
4121                # Init analysis dict
4122                param_exomiser_analysis_dict = {}
4123
4124                # analysis from param
4125                param_exomiser_analysis = param_exomiser.get("analysis", {})
4126                param_exomiser_analysis = full_path(param_exomiser_analysis)
4127
4128                # If analysis in param -> load anlaysis json
4129                if param_exomiser_analysis:
4130
4131                    # If param analysis is a file and exists
4132                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4133                        param_exomiser_analysis
4134                    ):
4135                        # Load analysis file into analysis dict (either yaml or json)
4136                        with open(param_exomiser_analysis) as json_file:
4137                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4138
4139                    # If param analysis is a dict
4140                    elif isinstance(param_exomiser_analysis, dict):
4141                        # Load analysis dict into analysis dict (either yaml or json)
4142                        param_exomiser_analysis_dict = param_exomiser_analysis
4143
4144                    # Error analysis type
4145                    else:
4146                        log.error(f"Analysis type unknown. Check param file.")
4147                        raise ValueError(f"Analysis type unknown. Check param file.")
4148
4149                # Case no input analysis config file/dict
4150                # Use preset (exome/genome) to open default config file
4151                if not param_exomiser_analysis_dict:
4152
4153                    # default preset
4154                    default_preset = "exome"
4155
4156                    # Get param preset or default preset
4157                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4158
4159                    # Try to find if preset is a file
4160                    if os.path.exists(param_exomiser_preset):
4161                        # Preset file is provided in full path
4162                        param_exomiser_analysis_default_config_file = (
4163                            param_exomiser_preset
4164                        )
4165                    # elif os.path.exists(full_path(param_exomiser_preset)):
4166                    #     # Preset file is provided in full path
4167                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4168                    elif os.path.exists(
4169                        os.path.join(folder_config, param_exomiser_preset)
4170                    ):
4171                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4172                        param_exomiser_analysis_default_config_file = os.path.join(
4173                            folder_config, param_exomiser_preset
4174                        )
4175                    else:
4176                        # Construct preset file
4177                        param_exomiser_analysis_default_config_file = os.path.join(
4178                            folder_config,
4179                            f"preset-{param_exomiser_preset}-analysis.json",
4180                        )
4181
4182                    # If preset file exists
4183                    param_exomiser_analysis_default_config_file = full_path(
4184                        param_exomiser_analysis_default_config_file
4185                    )
4186                    if os.path.exists(param_exomiser_analysis_default_config_file):
4187                        # Load prest file into analysis dict (either yaml or json)
4188                        with open(
4189                            param_exomiser_analysis_default_config_file
4190                        ) as json_file:
4191                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4192                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4193                                json_file
4194                            )
4195
4196                    # Error preset file
4197                    else:
4198                        log.error(
4199                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4200                        )
4201                        raise ValueError(
4202                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4203                        )
4204
4205                # If no analysis dict created
4206                if not param_exomiser_analysis_dict:
4207                    log.error(f"No analysis config")
4208                    raise ValueError(f"No analysis config")
4209
4210                # Log
4211                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4212
4213                ### PHENOPACKET ###
4214                ###################
4215
4216                # If no PhenoPacket in analysis dict -> check in param
4217                if "phenopacket" not in param_exomiser_analysis_dict:
4218
4219                    # If PhenoPacket in param -> load anlaysis json
4220                    if param_exomiser.get("phenopacket", None):
4221
4222                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4223                        param_exomiser_phenopacket = full_path(
4224                            param_exomiser_phenopacket
4225                        )
4226
4227                        # If param phenopacket is a file and exists
4228                        if isinstance(
4229                            param_exomiser_phenopacket, str
4230                        ) and os.path.exists(param_exomiser_phenopacket):
4231                            # Load phenopacket file into analysis dict (either yaml or json)
4232                            with open(param_exomiser_phenopacket) as json_file:
4233                                param_exomiser_analysis_dict["phenopacket"] = (
4234                                    yaml.safe_load(json_file)
4235                                )
4236
4237                        # If param phenopacket is a dict
4238                        elif isinstance(param_exomiser_phenopacket, dict):
4239                            # Load phenopacket dict into analysis dict (either yaml or json)
4240                            param_exomiser_analysis_dict["phenopacket"] = (
4241                                param_exomiser_phenopacket
4242                            )
4243
4244                        # Error phenopacket type
4245                        else:
4246                            log.error(f"Phenopacket type unknown. Check param file.")
4247                            raise ValueError(
4248                                f"Phenopacket type unknown. Check param file."
4249                            )
4250
4251                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4252                if "phenopacket" not in param_exomiser_analysis_dict:
4253
4254                    # Init PhenoPacket
4255                    param_exomiser_analysis_dict["phenopacket"] = {
4256                        "id": "analysis",
4257                        "proband": {},
4258                    }
4259
4260                    ### Add subject ###
4261
4262                    # If subject exists
4263                    param_exomiser_subject = param_exomiser.get("subject", {})
4264
4265                    # If subject not exists -> found sample ID
4266                    if not param_exomiser_subject:
4267
4268                        # Found sample ID in param
4269                        sample = param_exomiser.get("sample", None)
4270
4271                        # Find sample ID (first sample)
4272                        if not sample:
4273                            sample_list = self.get_header_sample_list()
4274                            if len(sample_list) > 0:
4275                                sample = sample_list[0]
4276                            else:
4277                                log.error(f"No sample found")
4278                                raise ValueError(f"No sample found")
4279
4280                        # Create subject
4281                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4282
4283                    # Add to dict
4284                    param_exomiser_analysis_dict["phenopacket"][
4285                        "subject"
4286                    ] = param_exomiser_subject
4287
4288                    ### Add "phenotypicFeatures" ###
4289
4290                    # If phenotypicFeatures exists
4291                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4292                        "phenotypicFeatures", []
4293                    )
4294
4295                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4296                    if not param_exomiser_phenotypicfeatures:
4297
4298                        # Found HPO in param
4299                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4300
4301                        # Split HPO if list in string format separated by comma
4302                        if isinstance(param_exomiser_hpo, str):
4303                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4304
4305                        # Create HPO list
4306                        for hpo in param_exomiser_hpo:
4307                            hpo_clean = re.sub("[^0-9]", "", hpo)
4308                            param_exomiser_phenotypicfeatures.append(
4309                                {
4310                                    "type": {
4311                                        "id": f"HP:{hpo_clean}",
4312                                        "label": f"HP:{hpo_clean}",
4313                                    }
4314                                }
4315                            )
4316
4317                    # Add to dict
4318                    param_exomiser_analysis_dict["phenopacket"][
4319                        "phenotypicFeatures"
4320                    ] = param_exomiser_phenotypicfeatures
4321
4322                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4323                    if not param_exomiser_phenotypicfeatures:
4324                        for step in param_exomiser_analysis_dict.get(
4325                            "analysis", {}
4326                        ).get("steps", []):
4327                            if "hiPhivePrioritiser" in step:
4328                                param_exomiser_analysis_dict.get("analysis", {}).get(
4329                                    "steps", []
4330                                ).remove(step)
4331
4332                ### Add Input File ###
4333
4334                # Initial file name and htsFiles
4335                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4336                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4337                    {
4338                        "uri": tmp_vcf_name,
4339                        "htsFormat": "VCF",
4340                        "genomeAssembly": assembly,
4341                    }
4342                ]
4343
4344                ### Add metaData ###
4345
4346                # If metaData not in analysis dict
4347                if "metaData" not in param_exomiser_analysis_dict:
4348                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4349                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4350                        "createdBy": "howard",
4351                        "phenopacketSchemaVersion": 1,
4352                    }
4353
4354                ### OutputOptions ###
4355
4356                # Init output result folder
4357                output_results = os.path.join(tmp_dir, "results")
4358
4359                # If no outputOptions in analysis dict
4360                if "outputOptions" not in param_exomiser_analysis_dict:
4361
4362                    # default output formats
4363                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4364
4365                    # Get outputOptions in param
4366                    output_options = param_exomiser.get("outputOptions", None)
4367
4368                    # If no output_options in param -> check
4369                    if not output_options:
4370                        output_options = {
4371                            "outputContributingVariantsOnly": False,
4372                            "numGenes": 0,
4373                            "outputFormats": defaut_output_formats,
4374                        }
4375
4376                    # Replace outputDirectory in output options
4377                    output_options["outputDirectory"] = output_results
4378                    output_options["outputFileName"] = "howard"
4379
4380                    # Add outputOptions in analysis dict
4381                    param_exomiser_analysis_dict["outputOptions"] = output_options
4382
4383                else:
4384
4385                    # Replace output_results and output format (if exists in param)
4386                    param_exomiser_analysis_dict["outputOptions"][
4387                        "outputDirectory"
4388                    ] = output_results
4389                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4390                        list(
4391                            set(
4392                                param_exomiser_analysis_dict.get(
4393                                    "outputOptions", {}
4394                                ).get("outputFormats", [])
4395                                + ["TSV_VARIANT", "VCF"]
4396                            )
4397                        )
4398                    )
4399
4400                # log
4401                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4402
4403                ### ANALYSIS FILE ###
4404                #####################
4405
4406                ### Full JSON analysis config file ###
4407
4408                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4409                with open(exomiser_analysis, "w") as fp:
4410                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4411
4412                ### SPLIT analysis and sample config files
4413
4414                # Splitted analysis dict
4415                param_exomiser_analysis_dict_for_split = (
4416                    param_exomiser_analysis_dict.copy()
4417                )
4418
4419                # Phenopacket JSON file
4420                exomiser_analysis_phenopacket = os.path.join(
4421                    tmp_dir, "analysis_phenopacket.json"
4422                )
4423                with open(exomiser_analysis_phenopacket, "w") as fp:
4424                    json.dump(
4425                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4426                        fp,
4427                        indent=4,
4428                    )
4429
4430                # Analysis JSON file without Phenopacket parameters
4431                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4432                exomiser_analysis_analysis = os.path.join(
4433                    tmp_dir, "analysis_analysis.json"
4434                )
4435                with open(exomiser_analysis_analysis, "w") as fp:
4436                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4437
4438                ### INITAL VCF file ###
4439                #######################
4440
4441                ### Create list of samples to use and include inti initial VCF file ####
4442
4443                # Subject (main sample)
4444                # Get sample ID in analysis dict
4445                sample_subject = (
4446                    param_exomiser_analysis_dict.get("phenopacket", {})
4447                    .get("subject", {})
4448                    .get("id", None)
4449                )
4450                sample_proband = (
4451                    param_exomiser_analysis_dict.get("phenopacket", {})
4452                    .get("proband", {})
4453                    .get("subject", {})
4454                    .get("id", None)
4455                )
4456                sample = []
4457                if sample_subject:
4458                    sample.append(sample_subject)
4459                if sample_proband:
4460                    sample.append(sample_proband)
4461
4462                # Get sample ID within Pedigree
4463                pedigree_persons_list = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("pedigree", {})
4466                    .get("persons", {})
4467                )
4468
4469                # Create list with all sample ID in pedigree (if exists)
4470                pedigree_persons = []
4471                for person in pedigree_persons_list:
4472                    pedigree_persons.append(person.get("individualId"))
4473
4474                # Concat subject sample ID and samples ID in pedigreesamples
4475                samples = list(set(sample + pedigree_persons))
4476
4477                # Check if sample list is not empty
4478                if not samples:
4479                    log.error(f"No samples found")
4480                    raise ValueError(f"No samples found")
4481
4482                # Create VCF with sample (either sample in param or first one by default)
4483                # Export VCF file
4484                self.export_variant_vcf(
4485                    vcf_file=tmp_vcf_name,
4486                    remove_info=True,
4487                    add_samples=True,
4488                    list_samples=samples,
4489                    index=False,
4490                )
4491
4492                ### Execute Exomiser ###
4493                ########################
4494
4495                # Init command
4496                exomiser_command = ""
4497
4498                # Command exomiser options
4499                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4500
4501                # Release
4502                exomiser_release = param_exomiser.get("release", None)
4503                if exomiser_release:
4504                    # phenotype data version
4505                    exomiser_options += (
4506                        f" --exomiser.phenotype.data-version={exomiser_release} "
4507                    )
4508                    # data version
4509                    exomiser_options += (
4510                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4511                    )
4512                    # variant white list
4513                    variant_white_list_file = (
4514                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4515                    )
4516                    if os.path.exists(
4517                        os.path.join(
4518                            databases_folders, assembly, variant_white_list_file
4519                        )
4520                    ):
4521                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4522
4523                # transcript_source
4524                transcript_source = param_exomiser.get(
4525                    "transcript_source", None
4526                )  # ucsc, refseq, ensembl
4527                if transcript_source:
4528                    exomiser_options += (
4529                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4530                    )
4531
4532                # If analysis contain proband param
4533                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4534                    "proband", {}
4535                ):
4536                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4537
4538                # If no proband (usually uniq sample)
4539                else:
4540                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4541
4542                # Log
4543                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4544
4545                # Run command
4546                result = subprocess.call(
4547                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4548                )
4549                if result:
4550                    log.error("Exomiser command failed")
4551                    raise ValueError("Exomiser command failed")
4552
4553                ### RESULTS ###
4554                ###############
4555
4556                ### Annotate with TSV fields ###
4557
4558                # Init result tsv file
4559                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4560
4561                # Init result tsv file
4562                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4563
4564                # Parse TSV file and explode columns in INFO field
4565                if exomiser_to_info and os.path.exists(output_results_tsv):
4566
4567                    # Log
4568                    log.debug("Exomiser columns to VCF INFO field")
4569
4570                    # Retrieve columns and types
4571                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4572                    output_results_tsv_df = self.get_query_to_df(query)
4573                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4574
4575                    # Init concat fields for update
4576                    sql_query_update_concat_fields = []
4577
4578                    # Fields to avoid
4579                    fields_to_avoid = [
4580                        "CONTIG",
4581                        "START",
4582                        "END",
4583                        "REF",
4584                        "ALT",
4585                        "QUAL",
4586                        "FILTER",
4587                        "GENOTYPE",
4588                    ]
4589
4590                    # List all columns to add into header
4591                    for header_column in output_results_tsv_columns:
4592
4593                        # If header column is enable
4594                        if header_column not in fields_to_avoid:
4595
4596                            # Header info type
4597                            header_info_type = "String"
4598                            header_column_df = output_results_tsv_df[header_column]
4599                            header_column_df_dtype = header_column_df.dtype
4600                            if header_column_df_dtype == object:
4601                                if (
4602                                    pd.to_numeric(header_column_df, errors="coerce")
4603                                    .notnull()
4604                                    .all()
4605                                ):
4606                                    header_info_type = "Float"
4607                            else:
4608                                header_info_type = "Integer"
4609
4610                            # Header info
4611                            characters_to_validate = ["-"]
4612                            pattern = "[" + "".join(characters_to_validate) + "]"
4613                            header_info_name = re.sub(
4614                                pattern,
4615                                "_",
4616                                f"Exomiser_{header_column}".replace("#", ""),
4617                            )
4618                            header_info_number = "."
4619                            header_info_description = (
4620                                f"Exomiser {header_column} annotation"
4621                            )
4622                            header_info_source = "Exomiser"
4623                            header_info_version = "unknown"
4624                            header_info_code = CODE_TYPE_MAP[header_info_type]
4625                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4626                                header_info_name,
4627                                header_info_number,
4628                                header_info_type,
4629                                header_info_description,
4630                                header_info_source,
4631                                header_info_version,
4632                                header_info_code,
4633                            )
4634
4635                            # Add field to add for update to concat fields
4636                            sql_query_update_concat_fields.append(
4637                                f"""
4638                                CASE
4639                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4640                                    THEN concat(
4641                                        '{header_info_name}=',
4642                                        table_parquet."{header_column}",
4643                                        ';'
4644                                        )
4645
4646                                    ELSE ''
4647                                END
4648                            """
4649                            )
4650
4651                    # Update query
4652                    sql_query_update = f"""
4653                        UPDATE {table_variants} as table_variants
4654                            SET INFO = concat(
4655                                            CASE
4656                                                WHEN INFO NOT IN ('', '.')
4657                                                THEN INFO
4658                                                ELSE ''
4659                                            END,
4660                                            CASE
4661                                                WHEN table_variants.INFO NOT IN ('','.')
4662                                                THEN ';'
4663                                                ELSE ''
4664                                            END,
4665                                            (
4666                                            SELECT 
4667                                                concat(
4668                                                    {",".join(sql_query_update_concat_fields)}
4669                                                )
4670                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4671                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4672                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4673                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4674                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4675                                            )
4676                                        )
4677                            ;
4678                        """
4679
4680                    # Update
4681                    self.conn.execute(sql_query_update)
4682
4683                ### Annotate with VCF INFO field ###
4684
4685                # Init result VCF file
4686                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4687
4688                # If VCF exists
4689                if os.path.exists(output_results_vcf):
4690
4691                    # Log
4692                    log.debug("Exomiser result VCF update variants")
4693
4694                    # Find Exomiser INFO field annotation in header
4695                    with gzip.open(output_results_vcf, "rt") as f:
4696                        header_list = self.read_vcf_header(f)
4697                    exomiser_vcf_header = vcf.Reader(
4698                        io.StringIO("\n".join(header_list))
4699                    )
4700
4701                    # Add annotation INFO field to header
4702                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4703
4704                    # Update variants with VCF
4705                    self.update_from_vcf(output_results_vcf)
4706
4707        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4709    def annotation_snpeff(self, threads: int = None) -> None:
4710        """
4711        This function annotate with snpEff
4712
4713        :param threads: The number of threads to use
4714        :return: the value of the variable "return_value".
4715        """
4716
4717        # DEBUG
4718        log.debug("Start annotation with snpeff databases")
4719
4720        # Threads
4721        if not threads:
4722            threads = self.get_threads()
4723        log.debug("Threads: " + str(threads))
4724
4725        # DEBUG
4726        delete_tmp = True
4727        if self.get_config().get("verbosity", "warning") in ["debug"]:
4728            delete_tmp = False
4729            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4730
4731        # Config
4732        config = self.get_config()
4733        log.debug("Config: " + str(config))
4734
4735        # Config - Folders - Databases
4736        databases_folders = (
4737            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4738        )
4739        log.debug("Databases annotations: " + str(databases_folders))
4740
4741        # # Config - Java
4742        # java_bin = get_bin(
4743        #     tool="java",
4744        #     bin="java",
4745        #     bin_type="bin",
4746        #     config=config,
4747        #     default_folder="/usr/bin",
4748        # )
4749        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4750        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4751        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4752
4753        # # Config - snpEff bin
4754        # snpeff_jar = get_bin(
4755        #     tool="snpeff",
4756        #     bin="snpEff.jar",
4757        #     bin_type="jar",
4758        #     config=config,
4759        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4760        # )
4761        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4762        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4764
4765        # Config - snpEff bin command
4766        snpeff_bin_command = get_bin_command(
4767            bin="snpEff.jar",
4768            tool="snpeff",
4769            bin_type="jar",
4770            config=config,
4771            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4772        )
4773        if not snpeff_bin_command:
4774            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4775            log.error(msg_err)
4776            raise ValueError(msg_err)
4777
4778        # Config - snpEff databases
4779        snpeff_databases = (
4780            config.get("folders", {})
4781            .get("databases", {})
4782            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4783        )
4784        snpeff_databases = full_path(snpeff_databases)
4785        if snpeff_databases is not None and snpeff_databases != "":
4786            log.debug(f"Create snpEff databases folder")
4787            if not os.path.exists(snpeff_databases):
4788                os.makedirs(snpeff_databases)
4789
4790        # Param
4791        param = self.get_param()
4792        log.debug("Param: " + str(param))
4793
4794        # Param
4795        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4796        log.debug("Options: " + str(options))
4797
4798        # Param - Assembly
4799        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4800
4801        # Param - Options
4802        snpeff_options = (
4803            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4804        )
4805        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4806        snpeff_csvstats = (
4807            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4808        )
4809        if snpeff_stats:
4810            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4811            snpeff_stats = full_path(snpeff_stats)
4812            snpeff_options += f" -stats {snpeff_stats}"
4813        if snpeff_csvstats:
4814            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4815            snpeff_csvstats = full_path(snpeff_csvstats)
4816            snpeff_options += f" -csvStats {snpeff_csvstats}"
4817
4818        # Data
4819        table_variants = self.get_table_variants()
4820
4821        # Check if not empty
4822        log.debug("Check if not empty")
4823        sql_query_chromosomes = (
4824            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4825        )
4826        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4827        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4828            log.info(f"VCF empty")
4829            return
4830
4831        # Export in VCF
4832        log.debug("Create initial file to annotate")
4833        tmp_vcf = NamedTemporaryFile(
4834            prefix=self.get_prefix(),
4835            dir=self.get_tmp_dir(),
4836            suffix=".vcf.gz",
4837            delete=True,
4838        )
4839        tmp_vcf_name = tmp_vcf.name
4840
4841        # VCF header
4842        vcf_reader = self.get_header()
4843        log.debug("Initial header: " + str(vcf_reader.infos))
4844
4845        # Existing annotations
4846        for vcf_annotation in self.get_header().infos:
4847
4848            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4849            log.debug(
4850                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4851            )
4852
4853        # Memory limit
4854        # if config.get("memory", None):
4855        #     memory_limit = config.get("memory", "8G")
4856        # else:
4857        #     memory_limit = "8G"
4858        memory_limit = self.get_memory("8G")
4859        log.debug(f"memory_limit: {memory_limit}")
4860
4861        # snpEff java options
4862        snpeff_java_options = (
4863            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4864        )
4865        log.debug(f"Exomiser java options: {snpeff_java_options}")
4866
4867        force_update_annotation = True
4868
4869        if "ANN" not in self.get_header().infos or force_update_annotation:
4870
4871            # Check snpEff database
4872            log.debug(f"Check snpEff databases {[assembly]}")
4873            databases_download_snpeff(
4874                folder=snpeff_databases, assemblies=[assembly], config=config
4875            )
4876
4877            # Export VCF file
4878            self.export_variant_vcf(
4879                vcf_file=tmp_vcf_name,
4880                remove_info=True,
4881                add_samples=False,
4882                index=True,
4883            )
4884
4885            # Tmp file
4886            err_files = []
4887            tmp_annotate_vcf = NamedTemporaryFile(
4888                prefix=self.get_prefix(),
4889                dir=self.get_tmp_dir(),
4890                suffix=".vcf",
4891                delete=False,
4892            )
4893            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4894            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4895            err_files.append(tmp_annotate_vcf_name_err)
4896
4897            # Command
4898            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4899            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4900            run_parallel_commands([snpeff_command], 1)
4901
4902            # Error messages
4903            log.info(f"Error/Warning messages:")
4904            error_message_command_all = []
4905            error_message_command_warning = []
4906            error_message_command_err = []
4907            for err_file in err_files:
4908                with open(err_file, "r") as f:
4909                    for line in f:
4910                        message = line.strip()
4911                        error_message_command_all.append(message)
4912                        if line.startswith("[W::"):
4913                            error_message_command_warning.append(message)
4914                        if line.startswith("[E::"):
4915                            error_message_command_err.append(f"{err_file}: " + message)
4916            # log info
4917            for message in list(
4918                set(error_message_command_err + error_message_command_warning)
4919            ):
4920                log.info(f"   {message}")
4921            # debug info
4922            for message in list(set(error_message_command_all)):
4923                log.debug(f"   {message}")
4924            # failed
4925            if len(error_message_command_err):
4926                log.error("Annotation failed: Error in commands")
4927                raise ValueError("Annotation failed: Error in commands")
4928
4929            # Find annotation in header
4930            with open(tmp_annotate_vcf_name, "rt") as f:
4931                header_list = self.read_vcf_header(f)
4932            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4933
4934            for ann in annovar_vcf_header.infos:
4935                if ann not in self.get_header().infos:
4936                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4937
4938            # Update variants
4939            log.info(f"Annotation - Updating...")
4940            self.update_from_vcf(tmp_annotate_vcf_name)
4941
4942        else:
4943            if "ANN" in self.get_header().infos:
4944                log.debug(f"Existing snpEff annotations in VCF")
4945            if force_update_annotation:
4946                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_annovar(self, threads: int = None) -> None:
4948    def annotation_annovar(self, threads: int = None) -> None:
4949        """
4950        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4951        annotations
4952
4953        :param threads: number of threads to use
4954        :return: the value of the variable "return_value".
4955        """
4956
4957        # DEBUG
4958        log.debug("Start annotation with Annovar databases")
4959
4960        # Threads
4961        if not threads:
4962            threads = self.get_threads()
4963        log.debug("Threads: " + str(threads))
4964
4965        # Tmp en Err files
4966        tmp_files = []
4967        err_files = []
4968
4969        # DEBUG
4970        delete_tmp = True
4971        if self.get_config().get("verbosity", "warning") in ["debug"]:
4972            delete_tmp = False
4973            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4974
4975        # Config
4976        config = self.get_config()
4977        log.debug("Config: " + str(config))
4978
4979        # Config - Folders - Databases
4980        databases_folders = (
4981            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4982        )
4983        log.debug("Databases annotations: " + str(databases_folders))
4984
4985        # Config - annovar bin command
4986        annovar_bin_command = get_bin_command(
4987            bin="table_annovar.pl",
4988            tool="annovar",
4989            bin_type="perl",
4990            config=config,
4991            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4992        )
4993        if not annovar_bin_command:
4994            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4995            log.error(msg_err)
4996            raise ValueError(msg_err)
4997
4998        # Config - BCFTools bin command
4999        bcftools_bin_command = get_bin_command(
5000            bin="bcftools",
5001            tool="bcftools",
5002            bin_type="bin",
5003            config=config,
5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5005        )
5006        if not bcftools_bin_command:
5007            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5008            log.error(msg_err)
5009            raise ValueError(msg_err)
5010
5011        # Config - annovar databases
5012        annovar_databases = (
5013            config.get("folders", {})
5014            .get("databases", {})
5015            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5016        )
5017        annovar_databases = full_path(annovar_databases)
5018        if annovar_databases != "" and not os.path.exists(annovar_databases):
5019            os.makedirs(annovar_databases)
5020
5021        # Param
5022        param = self.get_param()
5023        log.debug("Param: " + str(param))
5024
5025        # Param - options
5026        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5027        log.debug("Options: " + str(options))
5028
5029        # Param - annotations
5030        annotations = (
5031            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5032        )
5033        log.debug("Annotations: " + str(annotations))
5034
5035        # Param - Assembly
5036        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5037
5038        # Annovar database assembly
5039        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5040        if annovar_databases_assembly != "" and not os.path.exists(
5041            annovar_databases_assembly
5042        ):
5043            os.makedirs(annovar_databases_assembly)
5044
5045        # Data
5046        table_variants = self.get_table_variants()
5047
5048        # Check if not empty
5049        log.debug("Check if not empty")
5050        sql_query_chromosomes = (
5051            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5052        )
5053        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5054        if not sql_query_chromosomes_df["count"][0]:
5055            log.info(f"VCF empty")
5056            return
5057
5058        # VCF header
5059        vcf_reader = self.get_header()
5060        log.debug("Initial header: " + str(vcf_reader.infos))
5061
5062        # Existing annotations
5063        for vcf_annotation in self.get_header().infos:
5064
5065            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5066            log.debug(
5067                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5068            )
5069
5070        force_update_annotation = True
5071
5072        if annotations:
5073
5074            commands = []
5075            tmp_annotates_vcf_name_list = []
5076
5077            # Export in VCF
5078            log.debug("Create initial file to annotate")
5079            tmp_vcf = NamedTemporaryFile(
5080                prefix=self.get_prefix(),
5081                dir=self.get_tmp_dir(),
5082                suffix=".vcf.gz",
5083                delete=False,
5084            )
5085            tmp_vcf_name = tmp_vcf.name
5086            tmp_files.append(tmp_vcf_name)
5087            tmp_files.append(tmp_vcf_name + ".tbi")
5088
5089            # Export VCF file
5090            self.export_variant_vcf(
5091                vcf_file=tmp_vcf_name,
5092                remove_info=".",
5093                add_samples=False,
5094                index=True,
5095            )
5096
5097            # Create file for field rename
5098            log.debug("Create file for field rename")
5099            tmp_rename = NamedTemporaryFile(
5100                prefix=self.get_prefix(),
5101                dir=self.get_tmp_dir(),
5102                suffix=".rename",
5103                delete=False,
5104            )
5105            tmp_rename_name = tmp_rename.name
5106            tmp_files.append(tmp_rename_name)
5107
5108            # Check Annovar database
5109            log.debug(
5110                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5111            )
5112            databases_download_annovar(
5113                folder=annovar_databases,
5114                files=list(annotations.keys()),
5115                assemblies=[assembly],
5116            )
5117
5118            for annotation in annotations:
5119                annotation_fields = annotations[annotation]
5120
5121                if not annotation_fields:
5122                    annotation_fields = {"INFO": None}
5123
5124                log.info(f"Annotations Annovar - database '{annotation}'")
5125                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5126
5127                # Tmp file for annovar
5128                err_files = []
5129                tmp_annotate_vcf_directory = TemporaryDirectory(
5130                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5131                )
5132                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5133                tmp_annotate_vcf_name_annovar = (
5134                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5135                )
5136                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5137                err_files.append(tmp_annotate_vcf_name_err)
5138                tmp_files.append(tmp_annotate_vcf_name_err)
5139
5140                # Tmp file final vcf annotated by annovar
5141                tmp_annotate_vcf = NamedTemporaryFile(
5142                    prefix=self.get_prefix(),
5143                    dir=self.get_tmp_dir(),
5144                    suffix=".vcf.gz",
5145                    delete=False,
5146                )
5147                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5148                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name)
5150                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5151
5152                # Number of fields
5153                annotation_list = []
5154                annotation_renamed_list = []
5155
5156                for annotation_field in annotation_fields:
5157
5158                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5159                    annotation_fields_new_name = annotation_fields.get(
5160                        annotation_field, annotation_field
5161                    )
5162                    if not annotation_fields_new_name:
5163                        annotation_fields_new_name = annotation_field
5164
5165                    if (
5166                        force_update_annotation
5167                        or annotation_fields_new_name not in self.get_header().infos
5168                    ):
5169                        annotation_list.append(annotation_field)
5170                        annotation_renamed_list.append(annotation_fields_new_name)
5171                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5172                        log.warning(
5173                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5174                        )
5175
5176                    # Add rename info
5177                    run_parallel_commands(
5178                        [
5179                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5180                        ],
5181                        1,
5182                    )
5183
5184                # log.debug("fields_to_removed: " + str(fields_to_removed))
5185                log.debug("annotation_list: " + str(annotation_list))
5186
5187                # protocol
5188                protocol = annotation
5189
5190                # argument
5191                argument = ""
5192
5193                # operation
5194                operation = "f"
5195                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5196                    "ensGene"
5197                ):
5198                    operation = "g"
5199                    if options.get("genebase", None):
5200                        argument = f"""'{options.get("genebase","")}'"""
5201                elif annotation in ["cytoBand"]:
5202                    operation = "r"
5203
5204                # argument option
5205                argument_option = ""
5206                if argument != "":
5207                    argument_option = " --argument " + argument
5208
5209                # command options
5210                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5211                for option in options:
5212                    if option not in ["genebase"]:
5213                        command_options += f""" --{option}={options[option]}"""
5214
5215                # Command
5216
5217                # Command - Annovar
5218                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5219                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5220
5221                # Command - start pipe
5222                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5223
5224                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5225                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5226
5227                # Command - Special characters (refGene annotation)
5228                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5229
5230                # Command - Clean empty fields (with value ".")
5231                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5232
5233                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5234                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5235                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5236                    # for ann in annotation_renamed_list:
5237                    for ann in annotation_list:
5238                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5239
5240                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5241
5242                # Command - indexing
5243                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5244
5245                log.debug(f"Annotation - Annovar command: {command_annovar}")
5246                run_parallel_commands([command_annovar], 1)
5247
5248                # Error messages
5249                log.info(f"Error/Warning messages:")
5250                error_message_command_all = []
5251                error_message_command_warning = []
5252                error_message_command_err = []
5253                for err_file in err_files:
5254                    with open(err_file, "r") as f:
5255                        for line in f:
5256                            message = line.strip()
5257                            error_message_command_all.append(message)
5258                            if line.startswith("[W::") or line.startswith("WARNING"):
5259                                error_message_command_warning.append(message)
5260                            if line.startswith("[E::") or line.startswith("ERROR"):
5261                                error_message_command_err.append(
5262                                    f"{err_file}: " + message
5263                                )
5264                # log info
5265                for message in list(
5266                    set(error_message_command_err + error_message_command_warning)
5267                ):
5268                    log.info(f"   {message}")
5269                # debug info
5270                for message in list(set(error_message_command_all)):
5271                    log.debug(f"   {message}")
5272                # failed
5273                if len(error_message_command_err):
5274                    log.error("Annotation failed: Error in commands")
5275                    raise ValueError("Annotation failed: Error in commands")
5276
5277            if tmp_annotates_vcf_name_list:
5278
5279                # List of annotated files
5280                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5281
5282                # Tmp file
5283                tmp_annotate_vcf = NamedTemporaryFile(
5284                    prefix=self.get_prefix(),
5285                    dir=self.get_tmp_dir(),
5286                    suffix=".vcf.gz",
5287                    delete=False,
5288                )
5289                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5290                tmp_files.append(tmp_annotate_vcf_name)
5291                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5292                err_files.append(tmp_annotate_vcf_name_err)
5293                tmp_files.append(tmp_annotate_vcf_name_err)
5294
5295                # Command merge
5296                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5297                log.info(
5298                    f"Annotation Annovar - Annotation merging "
5299                    + str(len(tmp_annotates_vcf_name_list))
5300                    + " annotated files"
5301                )
5302                log.debug(f"Annotation - merge command: {merge_command}")
5303                run_parallel_commands([merge_command], 1)
5304
5305                # Find annotation in header
5306                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5307                    header_list = self.read_vcf_header(f)
5308                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5309
5310                for ann in annovar_vcf_header.infos:
5311                    if ann not in self.get_header().infos:
5312                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5313
5314                # Update variants
5315                log.info(f"Annotation Annovar - Updating...")
5316                self.update_from_vcf(tmp_annotate_vcf_name)
5317
5318            # Clean files
5319            # Tmp file remove command
5320            if True:
5321                tmp_files_remove_command = ""
5322                if tmp_files:
5323                    tmp_files_remove_command = " ".join(tmp_files)
5324                clean_command = f" rm -f {tmp_files_remove_command} "
5325                log.debug(f"Annotation Annovar - Annotation cleaning ")
5326                log.debug(f"Annotation - cleaning command: {clean_command}")
5327                run_parallel_commands([clean_command], 1)

# NOTE(review): repaired an extraction artifact here — the prose below is the
# stray docstring of annotation_annovar, preserved as a comment. A duplicated
# copy of the `def annotation_parquet(...)` signature that immediately
# preceded the real definition has been removed as extraction residue.
#
# annotation_annovar: takes a VCF file, annotates it with Annovar, and then
# updates the database with the new annotations.
#
# Parameters:
#   - threads: number of threads to use
#
# Returns:
#   the value of the variable "return_value".
5330    def annotation_parquet(self, threads: int = None) -> None:
5331        """
5332        It takes a VCF file, and annotates it with a parquet file
5333
5334        :param threads: number of threads to use for the annotation
5335        :return: the value of the variable "result".
5336        """
5337
5338        # DEBUG
5339        log.debug("Start annotation with parquet databases")
5340
5341        # Threads
5342        if not threads:
5343            threads = self.get_threads()
5344        log.debug("Threads: " + str(threads))
5345
5346        # DEBUG
5347        delete_tmp = True
5348        if self.get_config().get("verbosity", "warning") in ["debug"]:
5349            delete_tmp = False
5350            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5351
5352        # Config
5353        databases_folders = set(
5354            self.get_config()
5355            .get("folders", {})
5356            .get("databases", {})
5357            .get("annotations", ["."])
5358            + self.get_config()
5359            .get("folders", {})
5360            .get("databases", {})
5361            .get("parquet", ["."])
5362        )
5363        log.debug("Databases annotations: " + str(databases_folders))
5364
5365        # Param
5366        annotations = (
5367            self.get_param()
5368            .get("annotation", {})
5369            .get("parquet", {})
5370            .get("annotations", None)
5371        )
5372        log.debug("Annotations: " + str(annotations))
5373
5374        # Assembly
5375        assembly = self.get_param().get(
5376            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5377        )
5378
5379        # Force Update Annotation
5380        force_update_annotation = (
5381            self.get_param()
5382            .get("annotation", {})
5383            .get("options", {})
5384            .get("annotations_update", False)
5385        )
5386        log.debug(f"force_update_annotation={force_update_annotation}")
5387        force_append_annotation = (
5388            self.get_param()
5389            .get("annotation", {})
5390            .get("options", {})
5391            .get("annotations_append", False)
5392        )
5393        log.debug(f"force_append_annotation={force_append_annotation}")
5394
5395        # Data
5396        table_variants = self.get_table_variants()
5397
5398        # Check if not empty
5399        log.debug("Check if not empty")
5400        sql_query_chromosomes_df = self.get_query_to_df(
5401            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5402        )
5403        if not sql_query_chromosomes_df["count"][0]:
5404            log.info(f"VCF empty")
5405            return
5406
5407        # VCF header
5408        vcf_reader = self.get_header()
5409        log.debug("Initial header: " + str(vcf_reader.infos))
5410
5411        # Nb Variants POS
5412        log.debug("NB Variants Start")
5413        nb_variants = self.conn.execute(
5414            f"SELECT count(*) AS count FROM variants"
5415        ).fetchdf()["count"][0]
5416        log.debug("NB Variants Stop")
5417
5418        # Existing annotations
5419        for vcf_annotation in self.get_header().infos:
5420
5421            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5422            log.debug(
5423                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5424            )
5425
5426        # Added columns
5427        added_columns = []
5428
5429        # drop indexes
5430        log.debug(f"Drop indexes...")
5431        self.drop_indexes()
5432
5433        if annotations:
5434
5435            if "ALL" in annotations:
5436
5437                all_param = annotations.get("ALL", {})
5438                all_param_formats = all_param.get("formats", None)
5439                all_param_releases = all_param.get("releases", None)
5440
5441                databases_infos_dict = self.scan_databases(
5442                    database_formats=all_param_formats,
5443                    database_releases=all_param_releases,
5444                )
5445                for database_infos in databases_infos_dict.keys():
5446                    if database_infos not in annotations:
5447                        annotations[database_infos] = {"INFO": None}
5448
5449            for annotation in annotations:
5450
5451                if annotation in ["ALL"]:
5452                    continue
5453
5454                # Annotation Name
5455                annotation_name = os.path.basename(annotation)
5456
5457                # Annotation fields
5458                annotation_fields = annotations[annotation]
5459                if not annotation_fields:
5460                    annotation_fields = {"INFO": None}
5461
5462                log.debug(f"Annotation '{annotation_name}'")
5463                log.debug(
5464                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5465                )
5466
5467                # Create Database
5468                database = Database(
5469                    database=annotation,
5470                    databases_folders=databases_folders,
5471                    assembly=assembly,
5472                )
5473
5474                # Find files
5475                parquet_file = database.get_database()
5476                parquet_hdr_file = database.get_header_file()
5477                parquet_type = database.get_type()
5478
5479                # Check if files exists
5480                if not parquet_file or not parquet_hdr_file:
5481                    log.error("Annotation failed: file not found")
5482                    raise ValueError("Annotation failed: file not found")
5483                else:
5484                    # Get parquet connexion
5485                    parquet_sql_attach = database.get_sql_database_attach(
5486                        output="query"
5487                    )
5488                    if parquet_sql_attach:
5489                        self.conn.execute(parquet_sql_attach)
5490                    parquet_file_link = database.get_sql_database_link()
5491                    # Log
5492                    log.debug(
5493                        f"Annotation '{annotation_name}' - file: "
5494                        + str(parquet_file)
5495                        + " and "
5496                        + str(parquet_hdr_file)
5497                    )
5498
5499                    # Database full header columns
5500                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5501                        parquet_hdr_file
5502                    )
5503                    # Log
5504                    log.debug(
5505                        "Annotation database header columns : "
5506                        + str(parquet_hdr_vcf_header_columns)
5507                    )
5508
5509                    # Load header as VCF object
5510                    parquet_hdr_vcf_header_infos = database.get_header().infos
5511                    # Log
5512                    log.debug(
5513                        "Annotation database header: "
5514                        + str(parquet_hdr_vcf_header_infos)
5515                    )
5516
5517                    # Get extra infos
5518                    parquet_columns = database.get_extra_columns()
5519                    # Log
5520                    log.debug("Annotation database Columns: " + str(parquet_columns))
5521
5522                    # Add extra columns if "ALL" in annotation_fields
5523                    # if "ALL" in annotation_fields:
5524                    #     allow_add_extra_column = True
5525                    if "ALL" in annotation_fields and database.get_extra_columns():
5526                        for extra_column in database.get_extra_columns():
5527                            if (
5528                                extra_column not in annotation_fields
5529                                and extra_column.replace("INFO/", "")
5530                                not in parquet_hdr_vcf_header_infos
5531                            ):
5532                                parquet_hdr_vcf_header_infos[extra_column] = (
5533                                    vcf.parser._Info(
5534                                        extra_column,
5535                                        ".",
5536                                        "String",
5537                                        f"{extra_column} description",
5538                                        "unknown",
5539                                        "unknown",
5540                                        self.code_type_map["String"],
5541                                    )
5542                                )
5543
5544                    # For all fields in database
5545                    annotation_fields_all = False
5546                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5547                        annotation_fields_all = True
5548                        annotation_fields = {
5549                            key: key for key in parquet_hdr_vcf_header_infos
5550                        }
5551
5552                        log.debug(
5553                            "Annotation database header - All annotations added: "
5554                            + str(annotation_fields)
5555                        )
5556
5557                    # Init
5558
5559                    # List of annotation fields to use
5560                    sql_query_annotation_update_info_sets = []
5561
5562                    # List of annotation to agregate
5563                    sql_query_annotation_to_agregate = []
5564
5565                    # Number of fields
5566                    nb_annotation_field = 0
5567
5568                    # Annotation fields processed
5569                    annotation_fields_processed = []
5570
5571                    # Columns mapping
5572                    map_columns = database.map_columns(
5573                        columns=annotation_fields, prefixes=["INFO/"]
5574                    )
5575
5576                    # Query dict for fields to remove (update option)
5577                    query_dict_remove = {}
5578
5579                    # Fetch Anotation fields
5580                    for annotation_field in annotation_fields:
5581
5582                        # annotation_field_column
5583                        annotation_field_column = map_columns.get(
5584                            annotation_field, "INFO"
5585                        )
5586
5587                        # field new name, if parametered
5588                        annotation_fields_new_name = annotation_fields.get(
5589                            annotation_field, annotation_field
5590                        )
5591                        if not annotation_fields_new_name:
5592                            annotation_fields_new_name = annotation_field
5593
5594                        # To annotate
5595                        # force_update_annotation = True
5596                        # force_append_annotation = True
5597                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5598                        if annotation_field in parquet_hdr_vcf_header_infos and (
5599                            force_update_annotation
5600                            or force_append_annotation
5601                            or (
5602                                annotation_fields_new_name
5603                                not in self.get_header().infos
5604                            )
5605                        ):
5606
5607                            # Add field to annotation to process list
5608                            annotation_fields_processed.append(
5609                                annotation_fields_new_name
5610                            )
5611
5612                            # explode infos for the field
5613                            annotation_fields_new_name_info_msg = ""
5614                            if (
5615                                force_update_annotation
5616                                and annotation_fields_new_name
5617                                in self.get_header().infos
5618                            ):
5619                                # Remove field from INFO
5620                                query = f"""
5621                                    UPDATE {table_variants} as table_variants
5622                                    SET INFO = REGEXP_REPLACE(
5623                                                concat(table_variants.INFO,''),
5624                                                ';*{annotation_fields_new_name}=[^;]*',
5625                                                ''
5626                                                )
5627                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5628                                """
5629                                annotation_fields_new_name_info_msg = " [update]"
5630                                query_dict_remove[
5631                                    f"remove 'INFO/{annotation_fields_new_name}'"
5632                                ] = query
5633
5634                            # Sep between fields in INFO
5635                            nb_annotation_field += 1
5636                            if nb_annotation_field > 1:
5637                                annotation_field_sep = ";"
5638                            else:
5639                                annotation_field_sep = ""
5640
5641                            log.info(
5642                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5643                            )
5644
5645                            # Add INFO field to header
5646                            parquet_hdr_vcf_header_infos_number = (
5647                                parquet_hdr_vcf_header_infos[annotation_field].num
5648                                or "."
5649                            )
5650                            parquet_hdr_vcf_header_infos_type = (
5651                                parquet_hdr_vcf_header_infos[annotation_field].type
5652                                or "String"
5653                            )
5654                            parquet_hdr_vcf_header_infos_description = (
5655                                parquet_hdr_vcf_header_infos[annotation_field].desc
5656                                or f"{annotation_field} description"
5657                            )
5658                            parquet_hdr_vcf_header_infos_source = (
5659                                parquet_hdr_vcf_header_infos[annotation_field].source
5660                                or "unknown"
5661                            )
5662                            parquet_hdr_vcf_header_infos_version = (
5663                                parquet_hdr_vcf_header_infos[annotation_field].version
5664                                or "unknown"
5665                            )
5666
5667                            vcf_reader.infos[annotation_fields_new_name] = (
5668                                vcf.parser._Info(
5669                                    annotation_fields_new_name,
5670                                    parquet_hdr_vcf_header_infos_number,
5671                                    parquet_hdr_vcf_header_infos_type,
5672                                    parquet_hdr_vcf_header_infos_description,
5673                                    parquet_hdr_vcf_header_infos_source,
5674                                    parquet_hdr_vcf_header_infos_version,
5675                                    self.code_type_map[
5676                                        parquet_hdr_vcf_header_infos_type
5677                                    ],
5678                                )
5679                            )
5680
5681                            # Append
5682                            if force_append_annotation:
5683                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5684                            else:
5685                                query_case_when_append = ""
5686
5687                            # Annotation/Update query fields
5688                            # Found in INFO column
5689                            if (
5690                                annotation_field_column == "INFO"
5691                                and "INFO" in parquet_hdr_vcf_header_columns
5692                            ):
5693                                sql_query_annotation_update_info_sets.append(
5694                                    f"""
5695                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5696                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5697                                        ELSE ''
5698                                    END
5699                                """
5700                                )
5701                            # Found in a specific column
5702                            else:
5703                                sql_query_annotation_update_info_sets.append(
5704                                    f"""
5705                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5706                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5707                                        ELSE ''
5708                                    END
5709                                """
5710                                )
5711                                sql_query_annotation_to_agregate.append(
5712                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5713                                )
5714
5715                        # Not to annotate
5716                        else:
5717
5718                            if force_update_annotation:
5719                                annotation_message = "forced"
5720                            else:
5721                                annotation_message = "skipped"
5722
5723                            if annotation_field not in parquet_hdr_vcf_header_infos:
5724                                log.warning(
5725                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5726                                )
5727                            if annotation_fields_new_name in self.get_header().infos:
5728                                log.warning(
5729                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5730                                )
5731
5732                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5733                    # allow_annotation_full_info = True
5734                    allow_annotation_full_info = not force_append_annotation
5735
5736                    if parquet_type in ["regions"]:
5737                        allow_annotation_full_info = False
5738
5739                    if (
5740                        allow_annotation_full_info
5741                        and nb_annotation_field == len(annotation_fields)
5742                        and annotation_fields_all
5743                        and (
5744                            "INFO" in parquet_hdr_vcf_header_columns
5745                            and "INFO" in database.get_extra_columns()
5746                        )
5747                    ):
5748                        log.debug("Column INFO annotation enabled")
5749                        sql_query_annotation_update_info_sets = []
5750                        sql_query_annotation_update_info_sets.append(
5751                            f" table_parquet.INFO "
5752                        )
5753
5754                    if sql_query_annotation_update_info_sets:
5755
5756                        # Annotate
5757                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5758
5759                        # Join query annotation update info sets for SQL
5760                        sql_query_annotation_update_info_sets_sql = ",".join(
5761                            sql_query_annotation_update_info_sets
5762                        )
5763
5764                        # Check chromosomes list (and variants infos)
5765                        sql_query_chromosomes = f"""
5766                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5767                            FROM {table_variants} as table_variants
5768                            GROUP BY table_variants."#CHROM"
5769                            ORDER BY table_variants."#CHROM"
5770                            """
5771                        sql_query_chromosomes_df = self.conn.execute(
5772                            sql_query_chromosomes
5773                        ).df()
5774                        sql_query_chromosomes_dict = {
5775                            entry["CHROM"]: {
5776                                "count": entry["count_variants"],
5777                                "min": entry["min_variants"],
5778                                "max": entry["max_variants"],
5779                            }
5780                            for index, entry in sql_query_chromosomes_df.iterrows()
5781                        }
5782
5783                        # Init
5784                        nb_of_query = 0
5785                        nb_of_variant_annotated = 0
5786                        query_dict = query_dict_remove
5787
5788                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5789                        for chrom in sql_query_chromosomes_dict:
5790
5791                            # Number of variant by chromosome
5792                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5793                                chrom, {}
5794                            ).get("count", 0)
5795
5796                            log.debug(
5797                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5798                            )
5799
5800                            # Annotation with regions database
5801                            if parquet_type in ["regions"]:
5802                                sql_query_annotation_from_clause = f"""
5803                                    FROM (
5804                                        SELECT 
5805                                            '{chrom}' AS \"#CHROM\",
5806                                            table_variants_from.\"POS\" AS \"POS\",
5807                                            {",".join(sql_query_annotation_to_agregate)}
5808                                        FROM {table_variants} as table_variants_from
5809                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5810                                            table_parquet_from."#CHROM" = '{chrom}'
5811                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5812                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5813                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5814                                                )
5815                                        )
5816                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5817                                        GROUP BY table_variants_from.\"POS\"
5818                                        )
5819                                        as table_parquet
5820                                """
5821
5822                                sql_query_annotation_where_clause = """
5823                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5824                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5825                                """
5826
5827                            # Annotation with variants database
5828                            else:
5829                                sql_query_annotation_from_clause = f"""
5830                                    FROM {parquet_file_link} as table_parquet
5831                                """
5832                                sql_query_annotation_where_clause = f"""
5833                                    table_variants."#CHROM" = '{chrom}'
5834                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5835                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5836                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5837                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5838                                """
5839
5840                            # Create update query
5841                            sql_query_annotation_chrom_interval_pos = f"""
5842                                UPDATE {table_variants} as table_variants
5843                                    SET INFO = 
5844                                        concat(
5845                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5846                                                THEN table_variants.INFO
5847                                                ELSE ''
5848                                            END
5849                                            ,
5850                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5851                                                        AND (
5852                                                        concat({sql_query_annotation_update_info_sets_sql})
5853                                                        )
5854                                                        NOT IN ('','.') 
5855                                                    THEN ';'
5856                                                    ELSE ''
5857                                            END
5858                                            ,
5859                                            {sql_query_annotation_update_info_sets_sql}
5860                                            )
5861                                    {sql_query_annotation_from_clause}
5862                                    WHERE {sql_query_annotation_where_clause}
5863                                    ;
5864                                """
5865
5866                            # Add update query to dict
5867                            query_dict[
5868                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5869                            ] = sql_query_annotation_chrom_interval_pos
5870
5871                        nb_of_query = len(query_dict)
5872                        num_query = 0
5873
5874                        # SET max_expression_depth TO x
5875                        self.conn.execute("SET max_expression_depth TO 10000")
5876
5877                        for query_name in query_dict:
5878                            query = query_dict[query_name]
5879                            num_query += 1
5880                            log.info(
5881                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5882                            )
5883                            result = self.conn.execute(query)
5884                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5885                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5886                            log.info(
5887                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5888                            )
5889
5890                        log.info(
5891                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5892                        )
5893
5894                    else:
5895
5896                        log.info(
5897                            f"Annotation '{annotation_name}' - No Annotations available"
5898                        )
5899
5900                    log.debug("Final header: " + str(vcf_reader.infos))
5901
5902        # Remove added columns
5903        for added_column in added_columns:
5904            self.drop_column(column=added_column)

Annotates the loaded VCF variants with a Parquet annotation database, updating the INFO column in place.

Parameters
  • threads: number of threads to use for the annotation
Returns

None; the variants table is updated in place.

def annotation_splice(self, threads: int = None) -> None:
5906    def annotation_splice(self, threads: int = None) -> None:
5907        """
5908        This function annotate with snpEff
5909
5910        :param threads: The number of threads to use
5911        :return: the value of the variable "return_value".
5912        """
5913
5914        # DEBUG
5915        log.debug("Start annotation with splice tools")
5916
5917        # Threads
5918        if not threads:
5919            threads = self.get_threads()
5920        log.debug("Threads: " + str(threads))
5921
5922        # DEBUG
5923        delete_tmp = True
5924        if self.get_config().get("verbosity", "warning") in ["debug"]:
5925            delete_tmp = False
5926            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5927
5928        # Config
5929        config = self.get_config()
5930        log.debug("Config: " + str(config))
5931        splice_config = config.get("tools", {}).get("splice", {})
5932        if not splice_config:
5933            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5934        if not splice_config:
5935            msg_err = "No Splice tool config"
5936            log.error(msg_err)
5937            raise ValueError(msg_err)
5938        log.debug(f"splice_config={splice_config}")
5939
5940        # Config - Folders - Databases
5941        databases_folders = (
5942            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5943        )
5944        log.debug("Databases annotations: " + str(databases_folders))
5945
5946        # Splice docker image
5947        splice_docker_image = splice_config.get("docker").get("image")
5948
5949        # Pull splice image if it's not already there
5950        if not check_docker_image_exists(splice_docker_image):
5951            log.warning(
5952                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5953            )
5954            try:
5955                command(f"docker pull {splice_config.get('docker').get('image')}")
5956            except subprocess.CalledProcessError:
5957                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5958                log.error(msg_err)
5959                raise ValueError(msg_err)
5960                return None
5961
5962        # Config - splice databases
5963        splice_databases = (
5964            config.get("folders", {})
5965            .get("databases", {})
5966            .get("splice", DEFAULT_SPLICE_FOLDER)
5967        )
5968        splice_databases = full_path(splice_databases)
5969
5970        # Param
5971        param = self.get_param()
5972        log.debug("Param: " + str(param))
5973
5974        # Param
5975        options = param.get("annotation", {}).get("splice", {})
5976        log.debug("Options: " + str(options))
5977
5978        # Data
5979        table_variants = self.get_table_variants()
5980
5981        # Check if not empty
5982        log.debug("Check if not empty")
5983        sql_query_chromosomes = (
5984            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5985        )
5986        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5987            log.info("VCF empty")
5988            return None
5989
5990        # Export in VCF
5991        log.debug("Create initial file to annotate")
5992
5993        # Create output folder
5994        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5995        if not os.path.exists(output_folder):
5996            Path(output_folder).mkdir(parents=True, exist_ok=True)
5997
5998        # Create tmp VCF file
5999        tmp_vcf = NamedTemporaryFile(
6000            prefix=self.get_prefix(),
6001            dir=output_folder,
6002            suffix=".vcf",
6003            delete=False,
6004        )
6005        tmp_vcf_name = tmp_vcf.name
6006
6007        # VCF header
6008        header = self.get_header()
6009
6010        # Existing annotations
6011        for vcf_annotation in self.get_header().infos:
6012
6013            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6014            log.debug(
6015                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6016            )
6017
6018        # Memory limit
6019        if config.get("memory", None):
6020            memory_limit = config.get("memory", "8G").upper()
6021            # upper()
6022        else:
6023            memory_limit = "8G"
6024        log.debug(f"memory_limit: {memory_limit}")
6025
6026        # Check number of variants to annotate
6027        where_clause_regex_spliceai = r"SpliceAI_\w+"
6028        where_clause_regex_spip = r"SPiP_\w+"
6029        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6030        df_list_of_variants_to_annotate = self.get_query_to_df(
6031            query=f""" SELECT * FROM variants {where_clause} """
6032        )
6033        if len(df_list_of_variants_to_annotate) == 0:
6034            log.warning(
6035                f"No variants to annotate with splice. Variants probably already annotated with splice"
6036            )
6037            return None
6038        else:
6039            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6040
6041        # Export VCF file
6042        self.export_variant_vcf(
6043            vcf_file=tmp_vcf_name,
6044            remove_info=True,
6045            add_samples=True,
6046            index=False,
6047            where_clause=where_clause,
6048        )
6049
6050        # Create docker container and launch splice analysis
6051        if splice_config:
6052
6053            # Splice mount folders
6054            mount_folders = splice_config.get("mount", {})
6055
6056            # Genome mount
6057            mount_folders[
6058                config.get("folders", {})
6059                .get("databases", {})
6060                .get("genomes", DEFAULT_GENOME_FOLDER)
6061            ] = "ro"
6062
6063            # SpliceAI mount
6064            mount_folders[
6065                config.get("folders", {})
6066                .get("databases", {})
6067                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6068            ] = "ro"
6069
6070            # Genome mount
6071            mount_folders[
6072                config.get("folders", {})
6073                .get("databases", {})
6074                .get("spip", DEFAULT_SPIP_FOLDER)
6075            ] = "ro"
6076
6077            # Mount folders
6078            mount = []
6079
6080            # Config mount
6081            mount = [
6082                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6083                for path, mode in mount_folders.items()
6084            ]
6085
6086            if any(value for value in splice_config.values() if value is None):
6087                log.warning("At least one splice config parameter is empty")
6088                return None
6089
6090            # Params in splice nf
6091            def check_values(dico: dict):
6092                """
6093                Ensure parameters for NF splice pipeline
6094                """
6095                for key, val in dico.items():
6096                    if key == "genome":
6097                        if any(
6098                            assemb in options.get("genome", {})
6099                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6100                        ):
6101                            yield f"--{key} hg19"
6102                        elif any(
6103                            assemb in options.get("genome", {})
6104                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6105                        ):
6106                            yield f"--{key} hg38"
6107                    elif (
6108                        (isinstance(val, str) and val)
6109                        or isinstance(val, int)
6110                        or isinstance(val, bool)
6111                    ):
6112                        yield f"--{key} {val}"
6113
6114            # Genome
6115            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6116            options["genome"] = genome
6117
6118            # NF params
6119            nf_params = []
6120
6121            # Add options
6122            if options:
6123                nf_params = list(check_values(options))
6124                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6125            else:
6126                log.debug("No NF params provided")
6127
6128            # Add threads
6129            if "threads" not in options.keys():
6130                nf_params.append(f"--threads {threads}")
6131
6132            # Genome path
6133            genome_path = find_genome(
6134                config.get("folders", {})
6135                .get("databases", {})
6136                .get("genomes", DEFAULT_GENOME_FOLDER),
6137                file=f"{genome}.fa",
6138            )
6139            # Add genome path
6140            if not genome_path:
6141                raise ValueError(
6142                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6143                )
6144            else:
6145                log.debug(f"Genome: {genome_path}")
6146                nf_params.append(f"--genome_path {genome_path}")
6147
6148            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6149                """
6150                Setting up updated databases for SPiP and SpliceAI
6151                """
6152
6153                try:
6154
6155                    # SpliceAI assembly transcriptome
6156                    spliceai_assembly = os.path.join(
6157                        config.get("folders", {})
6158                        .get("databases", {})
6159                        .get("spliceai", {}),
6160                        options.get("genome"),
6161                        "transcriptome",
6162                    )
6163                    spip_assembly = options.get("genome")
6164
6165                    spip = find(
6166                        f"transcriptome_{spip_assembly}.RData",
6167                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6168                    )
6169                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6170                    log.debug(f"SPiP annotations: {spip}")
6171                    log.debug(f"SpliceAI annotations: {spliceai}")
6172                    if spip and spliceai:
6173                        return [
6174                            f"--spip_transcriptome {spip}",
6175                            f"--spliceai_annotations {spliceai}",
6176                        ]
6177                    else:
6178                        # TODO crash and go on with basic annotations ?
6179                        # raise ValueError(
6180                        #     "Can't find splice databases in configuration EXIT"
6181                        # )
6182                        log.warning(
6183                            "Can't find splice databases in configuration, use annotations file from image"
6184                        )
6185                except TypeError:
6186                    log.warning(
6187                        "Can't find splice databases in configuration, use annotations file from image"
6188                    )
6189                    return []
6190
6191            # Add options, check if transcriptome option have already beend provided
6192            if (
6193                "spip_transcriptome" not in nf_params
6194                and "spliceai_transcriptome" not in nf_params
6195            ):
6196                splice_reference = splice_annotations(options, config)
6197                if splice_reference:
6198                    nf_params.extend(splice_reference)
6199
6200            nf_params.append(f"--output_folder {output_folder}")
6201
6202            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6203            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6204            log.debug(cmd)
6205
6206            splice_config["docker"]["command"] = cmd
6207
6208            docker_cmd = get_bin_command(
6209                tool="splice",
6210                bin_type="docker",
6211                config=config,
6212                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6213                add_options=f"--name {random_uuid} {' '.join(mount)}",
6214            )
6215
6216            # Docker debug
6217            # if splice_config.get("rm_container"):
6218            #     rm_container = "--rm"
6219            # else:
6220            #     rm_container = ""
6221            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6222
6223            log.debug(docker_cmd)
6224            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6225            log.debug(res.stdout)
6226            if res.stderr:
6227                log.error(res.stderr)
6228            res.check_returncode()
6229        else:
6230            log.warning(f"Splice tool configuration not found: {config}")
6231
6232        # Update variants
6233        log.info("Annotation - Updating...")
6234        # Test find output vcf
6235        log.debug(
6236            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6237        )
6238        output_vcf = []
6239        # Wrong folder to look in
6240        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6241            if (
6242                files
6243                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6244            ):
6245                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6246        # log.debug(os.listdir(options.get("output_folder")))
6247        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6248        if not output_vcf:
6249            log.debug(
6250                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6251            )
6252        else:
6253            # Get new header from annotated vcf
6254            log.debug(f"Initial header: {len(header.infos)} fields")
6255            # Create new header with splice infos
6256            new_vcf = Variants(input=output_vcf[0])
6257            new_vcf_header = new_vcf.get_header().infos
6258            for keys, infos in new_vcf_header.items():
6259                if keys not in header.infos.keys():
6260                    header.infos[keys] = infos
6261            log.debug(f"New header: {len(header.infos)} fields")
6262            log.debug(f"Splice tmp output: {output_vcf[0]}")
6263            self.update_from_vcf(output_vcf[0])
6264
6265        # Remove folder
6266        remove_if_exists(output_folder)

This function annotates variants with splice prediction tools (SPiP and SpliceAI) run through a Docker container.

Parameters
  • threads: The number of threads to use
Returns

None; the variants table is updated in place.

def get_config_default(self, name: str) -> dict:
6272    def get_config_default(self, name: str) -> dict:
6273        """
6274        The function `get_config_default` returns a dictionary containing default configurations for
6275        various calculations and prioritizations.
6276
6277        :param name: The `get_config_default` function returns a dictionary containing default
6278        configurations for different calculations and prioritizations. The `name` parameter is used to
6279        specify which specific configuration to retrieve from the dictionary
6280        :type name: str
6281        :return: The function `get_config_default` returns a dictionary containing default configuration
6282        settings for different calculations and prioritizations. The specific configuration settings are
6283        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6284        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6285        returned. If there is no match, an empty dictionary is returned.
6286        """
6287
6288        config_default = {
6289            "calculations": {
6290                "variant_chr_pos_alt_ref": {
6291                    "type": "sql",
6292                    "name": "variant_chr_pos_alt_ref",
6293                    "description": "Create a variant ID with chromosome, position, alt and ref",
6294                    "available": False,
6295                    "output_column_name": "variant_chr_pos_alt_ref",
6296                    "output_column_type": "String",
6297                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6298                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6299                    "operation_info": True,
6300                },
6301                "VARTYPE": {
6302                    "type": "sql",
6303                    "name": "VARTYPE",
6304                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6305                    "available": True,
6306                    "output_column_name": "VARTYPE",
6307                    "output_column_type": "String",
6308                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6309                    "operation_query": """
6310                            CASE
6311                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6312                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6313                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6314                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6315                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6316                                ELSE 'UNDEFINED'
6317                            END
6318                            """,
6319                    "info_fields": ["SVTYPE"],
6320                    "operation_info": True,
6321                },
6322                "snpeff_hgvs": {
6323                    "type": "python",
6324                    "name": "snpeff_hgvs",
6325                    "description": "HGVS nomenclatures from snpEff annotation",
6326                    "available": True,
6327                    "function_name": "calculation_extract_snpeff_hgvs",
6328                    "function_params": ["snpeff_hgvs", "ANN"],
6329                },
6330                "snpeff_ann_explode": {
6331                    "type": "python",
6332                    "name": "snpeff_ann_explode",
6333                    "description": "Explode snpEff annotations with uniquify values",
6334                    "available": True,
6335                    "function_name": "calculation_snpeff_ann_explode",
6336                    "function_params": [False, "fields", "snpeff_", "ANN"],
6337                },
6338                "snpeff_ann_explode_uniquify": {
6339                    "type": "python",
6340                    "name": "snpeff_ann_explode_uniquify",
6341                    "description": "Explode snpEff annotations",
6342                    "available": True,
6343                    "function_name": "calculation_snpeff_ann_explode",
6344                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6345                },
6346                "snpeff_ann_explode_json": {
6347                    "type": "python",
6348                    "name": "snpeff_ann_explode_json",
6349                    "description": "Explode snpEff annotations in JSON format",
6350                    "available": True,
6351                    "function_name": "calculation_snpeff_ann_explode",
6352                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6353                },
6354                "NOMEN": {
6355                    "type": "python",
6356                    "name": "NOMEN",
6357                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6358                    "available": True,
6359                    "function_name": "calculation_extract_nomen",
6360                    "function_params": [],
6361                },
6362                "FINDBYPIPELINE": {
6363                    "type": "python",
6364                    "name": "FINDBYPIPELINE",
6365                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6366                    "available": True,
6367                    "function_name": "calculation_find_by_pipeline",
6368                    "function_params": ["findbypipeline"],
6369                },
6370                "FINDBYSAMPLE": {
6371                    "type": "python",
6372                    "name": "FINDBYSAMPLE",
6373                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6374                    "available": True,
6375                    "function_name": "calculation_find_by_pipeline",
6376                    "function_params": ["findbysample"],
6377                },
6378                "GENOTYPECONCORDANCE": {
6379                    "type": "python",
6380                    "name": "GENOTYPECONCORDANCE",
6381                    "description": "Concordance of genotype for multi caller VCF",
6382                    "available": True,
6383                    "function_name": "calculation_genotype_concordance",
6384                    "function_params": [],
6385                },
6386                "BARCODE": {
6387                    "type": "python",
6388                    "name": "BARCODE",
6389                    "description": "BARCODE as VaRank tool",
6390                    "available": True,
6391                    "function_name": "calculation_barcode",
6392                    "function_params": [],
6393                },
6394                "BARCODEFAMILY": {
6395                    "type": "python",
6396                    "name": "BARCODEFAMILY",
6397                    "description": "BARCODEFAMILY as VaRank tool",
6398                    "available": True,
6399                    "function_name": "calculation_barcode_family",
6400                    "function_params": ["BCF"],
6401                },
6402                "TRIO": {
6403                    "type": "python",
6404                    "name": "TRIO",
6405                    "description": "Inheritance for a trio family",
6406                    "available": True,
6407                    "function_name": "calculation_trio",
6408                    "function_params": [],
6409                },
6410                "VAF": {
6411                    "type": "python",
6412                    "name": "VAF",
6413                    "description": "Variant Allele Frequency (VAF) harmonization",
6414                    "available": True,
6415                    "function_name": "calculation_vaf_normalization",
6416                    "function_params": [],
6417                },
6418                "VAF_stats": {
6419                    "type": "python",
6420                    "name": "VAF_stats",
6421                    "description": "Variant Allele Frequency (VAF) statistics",
6422                    "available": True,
6423                    "function_name": "calculation_genotype_stats",
6424                    "function_params": ["VAF"],
6425                },
6426                "DP_stats": {
6427                    "type": "python",
6428                    "name": "DP_stats",
6429                    "description": "Depth (DP) statistics",
6430                    "available": True,
6431                    "function_name": "calculation_genotype_stats",
6432                    "function_params": ["DP"],
6433                },
6434                "variant_id": {
6435                    "type": "python",
6436                    "name": "variant_id",
6437                    "description": "Variant ID generated from variant position and type",
6438                    "available": True,
6439                    "function_name": "calculation_variant_id",
6440                    "function_params": [],
6441                },
6442                "transcripts_json": {
6443                    "type": "python",
6444                    "name": "transcripts_json",
6445                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
6446                    "available": True,
6447                    "function_name": "calculation_transcripts_json",
6448                    "function_params": ["transcripts_json"],
6449                },
6450            },
6451            "prioritizations": {
6452                "default": {
6453                    "filter": [
6454                        {
6455                            "type": "notequals",
6456                            "value": "!PASS|\\.",
6457                            "score": 0,
6458                            "flag": "FILTERED",
6459                            "comment": ["Bad variant quality"],
6460                        },
6461                        {
6462                            "type": "equals",
6463                            "value": "REJECT",
6464                            "score": -20,
6465                            "flag": "PASS",
6466                            "comment": ["Bad variant quality"],
6467                        },
6468                    ],
6469                    "DP": [
6470                        {
6471                            "type": "gte",
6472                            "value": "50",
6473                            "score": 5,
6474                            "flag": "PASS",
6475                            "comment": ["DP higher than 50"],
6476                        }
6477                    ],
6478                    "ANN": [
6479                        {
6480                            "type": "contains",
6481                            "value": "HIGH",
6482                            "score": 5,
6483                            "flag": "PASS",
6484                            "comment": [
6485                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6486                            ],
6487                        },
6488                        {
6489                            "type": "contains",
6490                            "value": "MODERATE",
6491                            "score": 3,
6492                            "flag": "PASS",
6493                            "comment": [
6494                                "A non-disruptive variant that might change protein effectiveness"
6495                            ],
6496                        },
6497                        {
6498                            "type": "contains",
6499                            "value": "LOW",
6500                            "score": 0,
6501                            "flag": "FILTERED",
6502                            "comment": [
6503                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6504                            ],
6505                        },
6506                        {
6507                            "type": "contains",
6508                            "value": "MODIFIER",
6509                            "score": 0,
6510                            "flag": "FILTERED",
6511                            "comment": [
6512                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6513                            ],
6514                        },
6515                    ],
6516                }
6517            },
6518        }
6519
6520        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6522    def get_config_json(
6523        self, name: str, config_dict: dict = {}, config_file: str = None
6524    ) -> dict:
6525        """
6526        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6527        default values, a dictionary, and a file.
6528
6529        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6530        the name of the configuration. It is used to identify and retrieve the configuration settings
6531        for a specific component or module
6532        :type name: str
6533        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6534        dictionary that allows you to provide additional configuration settings or overrides. When you
6535        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6536        the key is the configuration setting you want to override or
6537        :type config_dict: dict
6538        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6539        specify the path to a configuration file that contains additional settings. If provided, the
6540        function will read the contents of this file and update the configuration dictionary with the
6541        values found in the file, overriding any existing values with the
6542        :type config_file: str
6543        :return: The function `get_config_json` returns a dictionary containing the configuration
6544        settings.
6545        """
6546
6547        # Create with default prioritizations
6548        config_default = self.get_config_default(name=name)
6549        configuration = config_default
6550        # log.debug(f"configuration={configuration}")
6551
6552        # Replace prioritizations from dict
6553        for config in config_dict:
6554            configuration[config] = config_dict[config]
6555
6556        # Replace prioritizations from file
6557        config_file = full_path(config_file)
6558        if config_file:
6559            if os.path.exists(config_file):
6560                with open(config_file) as config_file_content:
6561                    config_file_dict = json.load(config_file_content)
6562                for config in config_file_dict:
6563                    configuration[config] = config_file_dict[config]
6564            else:
6565                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6566                log.error(msg_error)
6567                raise ValueError(msg_error)
6568
6569        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: A dictionary of additional configuration settings or overrides. Key-value pairs passed here replace or add to the corresponding entries in the default configuration.
  • config_file: The path to a configuration file containing additional settings. If provided, the function reads the contents of this file and updates the configuration dictionary with the values found in the file, overriding any existing values with the ones from the file.
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization(self) -> None:
6571    def prioritization(self) -> None:
6572        """
6573        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6574        INFO fields
6575        """
6576
6577        # Config
6578        config = self.get_config()
6579
6580        # Param
6581        param = self.get_param()
6582
6583        # Quick Prioritizations
6584        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6585
6586        # Configuration profiles
6587        prioritization_config_file = param.get("prioritization", {}).get(
6588            "prioritization_config", None
6589        )
6590        prioritization_config_file = full_path(prioritization_config_file)
6591        prioritizations_config = self.get_config_json(
6592            name="prioritizations", config_file=prioritization_config_file
6593        )
6594
6595        # Prioritization options
6596        profiles = param.get("prioritization", {}).get("profiles", [])
6597        if isinstance(profiles, str):
6598            profiles = profiles.split(",")
6599        pzfields = param.get("prioritization", {}).get(
6600            "pzfields", ["PZFlag", "PZScore"]
6601        )
6602        if isinstance(pzfields, str):
6603            pzfields = pzfields.split(",")
6604        default_profile = param.get("prioritization", {}).get("default_profile", None)
6605        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6606        prioritization_score_mode = param.get("prioritization", {}).get(
6607            "prioritization_score_mode", "HOWARD"
6608        )
6609
6610        # Quick Prioritizations
6611        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6612        prioritizations = param.get("prioritizations", None)
6613        if prioritizations:
6614            log.info("Quick Prioritization:")
6615            for profile in prioritizations.split(","):
6616                if profile not in profiles:
6617                    profiles.append(profile)
6618                    log.info(f"   {profile}")
6619
6620        # If profile "ALL" provided, all profiles in the config profiles
6621        if "ALL" in profiles:
6622            profiles = list(prioritizations_config.keys())
6623
6624        for profile in profiles:
6625            if prioritizations_config.get(profile, None):
6626                log.debug(f"Profile '{profile}' configured")
6627            else:
6628                msg_error = f"Profile '{profile}' NOT configured"
6629                log.error(msg_error)
6630                raise ValueError(msg_error)
6631
6632        if profiles:
6633            log.info(f"Prioritization... ")
6634        else:
6635            log.debug(f"No profile defined")
6636            return
6637
6638        if not default_profile and len(profiles):
6639            default_profile = profiles[0]
6640
6641        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6642        log.debug("Profiles to check: " + str(list(profiles)))
6643
6644        # Variables
6645        table_variants = self.get_table_variants(clause="update")
6646
6647        # Added columns
6648        added_columns = []
6649
6650        # Create list of PZfields
6651        # List of PZFields
6652        list_of_pzfields_original = pzfields + [
6653            pzfield + pzfields_sep + profile
6654            for pzfield in pzfields
6655            for profile in profiles
6656        ]
6657        list_of_pzfields = []
6658        log.debug(f"{list_of_pzfields_original}")
6659
6660        # Remove existing PZfields to use if exists
6661        for pzfield in list_of_pzfields_original:
6662            if self.get_header().infos.get(pzfield, None) is None:
6663                list_of_pzfields.append(pzfield)
6664                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6665            else:
6666                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6667
6668        if list_of_pzfields:
6669
6670            # Explode Infos fields
6671            explode_infos_prefix = self.get_explode_infos_prefix()
6672            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6673            extra_infos = self.get_extra_infos()
6674
6675            # PZfields tags description
6676            PZfields_INFOS = {
6677                "PZTags": {
6678                    "ID": "PZTags",
6679                    "Number": ".",
6680                    "Type": "String",
6681                    "Description": "Variant tags based on annotation criteria",
6682                },
6683                "PZScore": {
6684                    "ID": "PZScore",
6685                    "Number": 1,
6686                    "Type": "Integer",
6687                    "Description": "Variant score based on annotation criteria",
6688                },
6689                "PZFlag": {
6690                    "ID": "PZFlag",
6691                    "Number": 1,
6692                    "Type": "String",
6693                    "Description": "Variant flag based on annotation criteria",
6694                },
6695                "PZComment": {
6696                    "ID": "PZComment",
6697                    "Number": ".",
6698                    "Type": "String",
6699                    "Description": "Variant comment based on annotation criteria",
6700                },
6701                "PZInfos": {
6702                    "ID": "PZInfos",
6703                    "Number": ".",
6704                    "Type": "String",
6705                    "Description": "Variant infos based on annotation criteria",
6706                },
6707            }
6708
6709            # Create INFO fields if not exist
6710            for field in PZfields_INFOS:
6711                field_ID = PZfields_INFOS[field]["ID"]
6712                field_description = PZfields_INFOS[field]["Description"]
6713                if field_ID not in self.get_header().infos and field_ID in pzfields:
6714                    field_description = (
6715                        PZfields_INFOS[field]["Description"]
6716                        + f", profile {default_profile}"
6717                    )
6718                    self.get_header().infos[field_ID] = vcf.parser._Info(
6719                        field_ID,
6720                        PZfields_INFOS[field]["Number"],
6721                        PZfields_INFOS[field]["Type"],
6722                        field_description,
6723                        "unknown",
6724                        "unknown",
6725                        code_type_map[PZfields_INFOS[field]["Type"]],
6726                    )
6727
6728            # Create INFO fields if not exist for each profile
6729            for profile in prioritizations_config:
6730                if profile in profiles or profiles == []:
6731                    for field in PZfields_INFOS:
6732                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6733                        field_description = (
6734                            PZfields_INFOS[field]["Description"]
6735                            + f", profile {profile}"
6736                        )
6737                        if (
6738                            field_ID not in self.get_header().infos
6739                            and field in pzfields
6740                        ):
6741                            self.get_header().infos[field_ID] = vcf.parser._Info(
6742                                field_ID,
6743                                PZfields_INFOS[field]["Number"],
6744                                PZfields_INFOS[field]["Type"],
6745                                field_description,
6746                                "unknown",
6747                                "unknown",
6748                                code_type_map[PZfields_INFOS[field]["Type"]],
6749                            )
6750
6751            # Header
6752            for pzfield in list_of_pzfields:
6753                if re.match("PZScore.*", pzfield):
6754                    added_column = self.add_column(
6755                        table_name=table_variants,
6756                        column_name=pzfield,
6757                        column_type="INTEGER",
6758                        default_value="0",
6759                    )
6760                elif re.match("PZFlag.*", pzfield):
6761                    added_column = self.add_column(
6762                        table_name=table_variants,
6763                        column_name=pzfield,
6764                        column_type="BOOLEAN",
6765                        default_value="1",
6766                    )
6767                else:
6768                    added_column = self.add_column(
6769                        table_name=table_variants,
6770                        column_name=pzfield,
6771                        column_type="STRING",
6772                        default_value="''",
6773                    )
6774                added_columns.append(added_column)
6775
6776            # Profiles
6777            if profiles:
6778
6779                # foreach profile in configuration file
6780                for profile in prioritizations_config:
6781
6782                    # If profile is asked in param, or ALL are asked (empty profile [])
6783                    if profile in profiles or profiles == []:
6784                        log.info(f"Profile '{profile}'")
6785
6786                        sql_set_info_option = ""
6787
6788                        sql_set_info = []
6789
6790                        # PZ fields set
6791
6792                        # PZScore
6793                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6794                            sql_set_info.append(
6795                                f"""
6796                                    concat(
6797                                        'PZScore{pzfields_sep}{profile}=',
6798                                        PZScore{pzfields_sep}{profile}
6799                                    ) 
6800                                """
6801                            )
6802                            if (
6803                                profile == default_profile
6804                                and "PZScore" in list_of_pzfields
6805                            ):
6806                                sql_set_info.append(
6807                                    f"""
6808                                        concat(
6809                                            'PZScore=',
6810                                            PZScore{pzfields_sep}{profile}
6811                                        )
6812                                    """
6813                                )
6814
6815                        # PZFlag
6816                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6817                            sql_set_info.append(
6818                                f"""
6819                                    concat(
6820                                        'PZFlag{pzfields_sep}{profile}=',
6821                                        CASE 
6822                                            WHEN PZFlag{pzfields_sep}{profile}==1
6823                                            THEN 'PASS'
6824                                            WHEN PZFlag{pzfields_sep}{profile}==0
6825                                            THEN 'FILTERED'
6826                                        END
6827                                    ) 
6828                                """
6829                            )
6830                            if (
6831                                profile == default_profile
6832                                and "PZFlag" in list_of_pzfields
6833                            ):
6834                                sql_set_info.append(
6835                                    f"""
6836                                        concat(
6837                                            'PZFlag=',
6838                                            CASE 
6839                                                WHEN PZFlag{pzfields_sep}{profile}==1
6840                                                THEN 'PASS'
6841                                                WHEN PZFlag{pzfields_sep}{profile}==0
6842                                                THEN 'FILTERED'
6843                                            END
6844                                        )
6845                                    """
6846                                )
6847
6848                        # PZComment
6849                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6850                            sql_set_info.append(
6851                                f"""
6852                                    CASE
6853                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6854                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6855                                        ELSE ''
6856                                    END
6857                                """
6858                            )
6859                            if (
6860                                profile == default_profile
6861                                and "PZComment" in list_of_pzfields
6862                            ):
6863                                sql_set_info.append(
6864                                    f"""
6865                                        CASE
6866                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6867                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6868                                            ELSE ''
6869                                        END
6870                                    """
6871                                )
6872
6873                        # PZInfos
6874                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6875                            sql_set_info.append(
6876                                f"""
6877                                    CASE
6878                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6879                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6880                                        ELSE ''
6881                                    END
6882                                """
6883                            )
6884                            if (
6885                                profile == default_profile
6886                                and "PZInfos" in list_of_pzfields
6887                            ):
6888                                sql_set_info.append(
6889                                    f"""
6890                                        CASE
6891                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6892                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6893                                            ELSE ''
6894                                        END
6895                                    """
6896                                )
6897
6898                        # Merge PZfields
6899                        sql_set_info_option = ""
6900                        sql_set_sep = ""
6901                        for sql_set in sql_set_info:
6902                            if sql_set_sep:
6903                                sql_set_info_option += f"""
6904                                    , concat('{sql_set_sep}', {sql_set})
6905                                """
6906                            else:
6907                                sql_set_info_option += f"""
6908                                    , {sql_set}
6909                                """
6910                            sql_set_sep = ";"
6911
6912                        sql_queries = []
6913                        for annotation in prioritizations_config[profile]:
6914
6915                            # Check if annotation field is present
6916                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6917                                log.debug(f"Annotation '{annotation}' not in data")
6918                                continue
6919                            else:
6920                                log.debug(f"Annotation '{annotation}' in data")
6921
6922                            # For each criterions
6923                            for criterion in prioritizations_config[profile][
6924                                annotation
6925                            ]:
6926                                criterion_type = criterion["type"]
6927                                criterion_value = criterion["value"]
6928                                criterion_score = criterion.get("score", 0)
6929                                criterion_flag = criterion.get("flag", "PASS")
6930                                criterion_flag_bool = criterion_flag == "PASS"
6931                                criterion_comment = (
6932                                    ", ".join(criterion.get("comment", []))
6933                                    .replace("'", "''")
6934                                    .replace(";", ",")
6935                                    .replace("\t", " ")
6936                                )
6937                                criterion_infos = (
6938                                    str(criterion)
6939                                    .replace("'", "''")
6940                                    .replace(";", ",")
6941                                    .replace("\t", " ")
6942                                )
6943
6944                                sql_set = []
6945                                sql_set_info = []
6946
6947                                # PZ fields set
6948                                if (
6949                                    f"PZScore{pzfields_sep}{profile}"
6950                                    in list_of_pzfields
6951                                ):
6952                                    if prioritization_score_mode == "HOWARD":
6953                                        sql_set.append(
6954                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6955                                        )
6956                                    elif prioritization_score_mode == "VaRank":
6957                                        sql_set.append(
6958                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6959                                        )
6960                                    else:
6961                                        sql_set.append(
6962                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6963                                        )
6964                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6965                                    sql_set.append(
6966                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6967                                    )
6968                                if (
6969                                    f"PZComment{pzfields_sep}{profile}"
6970                                    in list_of_pzfields
6971                                ):
6972                                    sql_set.append(
6973                                        f"""
6974                                            PZComment{pzfields_sep}{profile} = 
6975                                                concat(
6976                                                    PZComment{pzfields_sep}{profile},
6977                                                    CASE 
6978                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6979                                                        THEN ', '
6980                                                        ELSE ''
6981                                                    END,
6982                                                    '{criterion_comment}'
6983                                                )
6984                                        """
6985                                    )
6986                                if (
6987                                    f"PZInfos{pzfields_sep}{profile}"
6988                                    in list_of_pzfields
6989                                ):
6990                                    sql_set.append(
6991                                        f"""
6992                                            PZInfos{pzfields_sep}{profile} = 
6993                                                concat(
6994                                                    PZInfos{pzfields_sep}{profile},
6995                                                    '{criterion_infos}'
6996                                                )
6997                                        """
6998                                    )
6999                                sql_set_option = ",".join(sql_set)
7000
7001                                # Criterion and comparison
7002                                try:
7003                                    float(criterion_value)
7004                                    sql_update = f"""
7005                                        UPDATE {table_variants}
7006                                        SET {sql_set_option}
7007                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7008                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7009                                        """
7010                                except:
7011                                    contains_option = ""
7012                                    if criterion_type == "contains":
7013                                        contains_option = ".*"
7014                                    sql_update = f"""
7015                                        UPDATE {table_variants}
7016                                        SET {sql_set_option}
7017                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7018                                        """
7019                                sql_queries.append(sql_update)
7020
7021                        # PZTags
7022                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7023
7024                            # Create PZFalgs value
7025                            pztags_value = ""
7026                            pztags_sep_default = "|"
7027                            pztags_sep = ""
7028                            for pzfield in pzfields:
7029                                if pzfield not in ["PZTags"]:
7030                                    if (
7031                                        f"{pzfield}{pzfields_sep}{profile}"
7032                                        in list_of_pzfields
7033                                    ):
7034                                        if pzfield in ["PZFlag"]:
7035                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7036                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7037                                                    THEN 'PASS'
7038                                                    ELSE 'FILTERED'
7039                                                END, '"""
7040                                        else:
7041                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7042                                        pztags_sep = pztags_sep_default
7043
7044                            # Add Query update for PZFlags
7045                            sql_update_pztags = f"""
7046                                UPDATE {table_variants}
7047                                SET INFO = concat(
7048                                        INFO,
7049                                        CASE WHEN INFO NOT in ('','.')
7050                                                THEN ';'
7051                                                ELSE ''
7052                                        END,
7053                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7054                                    )
7055                                """
7056                            sql_queries.append(sql_update_pztags)
7057
7058                            # Add Query update for PZFlags for default
7059                            if profile == default_profile:
7060                                sql_update_pztags_default = f"""
7061                                UPDATE {table_variants}
7062                                SET INFO = concat(
7063                                        INFO,
7064                                        ';',
7065                                        'PZTags={pztags_value}'
7066                                    )
7067                                """
7068                                sql_queries.append(sql_update_pztags_default)
7069
7070                        log.info(f"""Profile '{profile}' - Prioritization... """)
7071
7072                        if sql_queries:
7073
7074                            for sql_query in sql_queries:
7075                                log.debug(
7076                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7077                                )
7078                                self.conn.execute(sql_query)
7079
7080                        log.info(f"""Profile '{profile}' - Update... """)
7081                        sql_query_update = f"""
7082                            UPDATE {table_variants}
7083                            SET INFO =  
7084                                concat(
7085                                    CASE
7086                                        WHEN INFO NOT IN ('','.')
7087                                        THEN concat(INFO, ';')
7088                                        ELSE ''
7089                                    END
7090                                    {sql_set_info_option}
7091                                )
7092                        """
7093                        self.conn.execute(sql_query_update)
7094
7095        else:
7096
7097            log.warning(f"No profiles in parameters")
7098
7099        # Remove added columns
7100        for added_column in added_columns:
7101            self.drop_column(column=added_column)
7102
7103        # Explode INFOS fields into table fields
7104        if self.get_explode_infos():
7105            self.explode_infos(
7106                prefix=self.get_explode_infos_prefix(),
7107                fields=self.get_explode_infos_fields(),
7108                force=True,
7109            )
7110
7111        return

Takes a VCF file and appends new prioritization INFO fields to it, computed from the values of existing INFO fields according to the configured profiles.

def annotation_hgvs(self, threads: int = None) -> None:
7117    def annotation_hgvs(self, threads: int = None) -> None:
7118        """
7119        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
7120        coordinates and alleles.
7121
7122        :param threads: The `threads` parameter is an optional integer that specifies the number of
7123        threads to use for parallel processing. If no value is provided, it will default to the number
7124        of threads obtained from the `get_threads()` method
7125        :type threads: int
7126        """
7127
7128        # Function for each partition of the Dask Dataframe
7129        def partition_function(partition):
7130            """
7131            The function `partition_function` applies the `annotation_hgvs_partition` function to
7132            each row of a DataFrame called `partition`.
7133
7134            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
7135            to be processed
7136            :return: the result of applying the "annotation_hgvs_partition" function to each row of
7137            the "partition" dataframe along the axis 1.
7138            """
7139            return partition.apply(annotation_hgvs_partition, axis=1)
7140
7141        def annotation_hgvs_partition(row) -> str:
7142            """
7143            The function `annotation_hgvs_partition` takes in a row of data and returns a string
7144            containing a list of HGVS names associated with the given genomic coordinates and alleles.
7145
7146            :param row: A dictionary-like object that contains the values for the following keys:
7147            :return: a string that contains the HGVS names associated with the given row of data.
7148            """
7149
7150            chr = row["CHROM"]
7151            pos = row["POS"]
7152            ref = row["REF"]
7153            alt = row["ALT"]
7154
7155            # Find list of associated transcripts
7156            transcripts_list = list(
7157                polars_conn.execute(
7158                    f"""
7159                SELECT transcript
7160                FROM refseq_df
7161                WHERE CHROM='{chr}'
7162                AND POS={pos}
7163            """
7164                )["transcript"]
7165            )
7166
7167            # Full HGVS annotation in list
7168            hgvs_full_list = []
7169
7170            for transcript_name in transcripts_list:
7171
7172                # Transcript
7173                transcript = get_transcript(
7174                    transcripts=transcripts, transcript_name=transcript_name
7175                )
7176                # Exon
7177                if use_exon:
7178                    exon = transcript.find_exon_number(pos)
7179                else:
7180                    exon = None
7181                # Protein
7182                transcript_protein = None
7183                if use_protein or add_protein or full_format:
7184                    transcripts_protein = list(
7185                        polars_conn.execute(
7186                            f"""
7187                        SELECT protein
7188                        FROM refseqlink_df
7189                        WHERE transcript='{transcript_name}'
7190                        LIMIT 1
7191                    """
7192                        )["protein"]
7193                    )
7194                    if len(transcripts_protein):
7195                        transcript_protein = transcripts_protein[0]
7196
7197                # HGVS name
7198                hgvs_name = format_hgvs_name(
7199                    chr,
7200                    pos,
7201                    ref,
7202                    alt,
7203                    genome=genome,
7204                    transcript=transcript,
7205                    transcript_protein=transcript_protein,
7206                    exon=exon,
7207                    use_gene=use_gene,
7208                    use_protein=use_protein,
7209                    full_format=full_format,
7210                    use_version=use_version,
7211                    codon_type=codon_type,
7212                )
7213                hgvs_full_list.append(hgvs_name)
7214                if add_protein and not use_protein and not full_format:
7215                    hgvs_name = format_hgvs_name(
7216                        chr,
7217                        pos,
7218                        ref,
7219                        alt,
7220                        genome=genome,
7221                        transcript=transcript,
7222                        transcript_protein=transcript_protein,
7223                        exon=exon,
7224                        use_gene=use_gene,
7225                        use_protein=True,
7226                        full_format=False,
7227                        use_version=use_version,
7228                        codon_type=codon_type,
7229                    )
7230                    hgvs_full_list.append(hgvs_name)
7231
7232            # Create liste of HGVS annotations
7233            hgvs_full = ",".join(hgvs_full_list)
7234
7235            return hgvs_full
7236
7237        # Polars connexion
7238        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7239
7240        # Config
7241        config = self.get_config()
7242
7243        # Databases
7244        # Genome
7245        databases_genomes_folders = (
7246            config.get("folders", {})
7247            .get("databases", {})
7248            .get("genomes", DEFAULT_GENOME_FOLDER)
7249        )
7250        databases_genome = (
7251            config.get("folders", {}).get("databases", {}).get("genomes", "")
7252        )
7253        # refseq database folder
7254        databases_refseq_folders = (
7255            config.get("folders", {})
7256            .get("databases", {})
7257            .get("refseq", DEFAULT_REFSEQ_FOLDER)
7258        )
7259        # refseq
7260        databases_refseq = config.get("databases", {}).get("refSeq", None)
7261        # refSeqLink
7262        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
7263
7264        # Param
7265        param = self.get_param()
7266
7267        # Quick HGVS
7268        if "hgvs_options" in param and param.get("hgvs_options", ""):
7269            log.info(f"Quick HGVS Annotation:")
7270            if not param.get("hgvs", None):
7271                param["hgvs"] = {}
7272            for option in param.get("hgvs_options", "").split(","):
7273                option_var_val = option.split("=")
7274                option_var = option_var_val[0]
7275                if len(option_var_val) > 1:
7276                    option_val = option_var_val[1]
7277                else:
7278                    option_val = "True"
7279                if option_val.upper() in ["TRUE"]:
7280                    option_val = True
7281                elif option_val.upper() in ["FALSE"]:
7282                    option_val = False
7283                log.info(f"   {option_var}={option_val}")
7284                param["hgvs"][option_var] = option_val
7285
7286        # Check if HGVS annotation enabled
7287        if "hgvs" in param:
7288            log.info(f"HGVS Annotation... ")
7289            for hgvs_option in param.get("hgvs", {}):
7290                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
7291        else:
7292            return
7293
7294        # HGVS Param
7295        param_hgvs = param.get("hgvs", {})
7296        use_exon = param_hgvs.get("use_exon", False)
7297        use_gene = param_hgvs.get("use_gene", False)
7298        use_protein = param_hgvs.get("use_protein", False)
7299        add_protein = param_hgvs.get("add_protein", False)
7300        full_format = param_hgvs.get("full_format", False)
7301        use_version = param_hgvs.get("use_version", False)
7302        codon_type = param_hgvs.get("codon_type", "3")
7303
7304        # refSseq refSeqLink
7305        databases_refseq = param_hgvs.get("refseq", databases_refseq)
7306        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
7307
7308        # Assembly
7309        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
7310
7311        # Genome
7312        genome_file = None
7313        if find_genome(databases_genome):
7314            genome_file = find_genome(databases_genome)
7315        else:
7316            genome_file = find_genome(
7317                genome_path=databases_genomes_folders, assembly=assembly
7318            )
7319        log.debug("Genome: " + str(genome_file))
7320
7321        # refSseq
7322        refseq_file = find_file_prefix(
7323            input_file=databases_refseq,
7324            prefix="ncbiRefSeq",
7325            folder=databases_refseq_folders,
7326            assembly=assembly,
7327        )
7328        log.debug("refSeq: " + str(refseq_file))
7329
7330        # refSeqLink
7331        refseqlink_file = find_file_prefix(
7332            input_file=databases_refseqlink,
7333            prefix="ncbiRefSeqLink",
7334            folder=databases_refseq_folders,
7335            assembly=assembly,
7336        )
7337        log.debug("refSeqLink: " + str(refseqlink_file))
7338
7339        # Threads
7340        if not threads:
7341            threads = self.get_threads()
7342        log.debug("Threads: " + str(threads))
7343
7344        # Variables
7345        table_variants = self.get_table_variants(clause="update")
7346
7347        # Get variants SNV and InDel only
7348        query_variants = f"""
7349            SELECT "#CHROM" AS CHROM, POS, REF, ALT
7350            FROM {table_variants}
7351            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
7352            """
7353        df_variants = self.get_query_to_df(query_variants)
7354
7355        # Added columns
7356        added_columns = []
7357
7358        # Add hgvs column in variants table
7359        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
7360        added_column = self.add_column(
7361            table_variants, hgvs_column_name, "STRING", default_value=None
7362        )
7363        added_columns.append(added_column)
7364
7365        log.debug(f"refSeq loading...")
7366        # refSeq in duckDB
7367        refseq_table = get_refseq_table(
7368            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
7369        )
7370        # Loading all refSeq in Dataframe
7371        refseq_query = f"""
7372            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
7373            FROM {refseq_table}
7374            JOIN df_variants ON (
7375                {refseq_table}.chrom = df_variants.CHROM
7376                AND {refseq_table}.txStart<=df_variants.POS
7377                AND {refseq_table}.txEnd>=df_variants.POS
7378            )
7379        """
7380        refseq_df = self.conn.query(refseq_query).pl()
7381
7382        if refseqlink_file:
7383            log.debug(f"refSeqLink loading...")
7384            # refSeqLink in duckDB
7385            refseqlink_table = get_refseq_table(
7386                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
7387            )
7388            # Loading all refSeqLink in Dataframe
7389            protacc_column = "protAcc_with_ver"
7390            mrnaacc_column = "mrnaAcc_with_ver"
7391            refseqlink_query = f"""
7392                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
7393                FROM {refseqlink_table} 
7394                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
7395                WHERE protAcc_without_ver IS NOT NULL
7396            """
7397            # Polars Dataframe
7398            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
7399
7400        # Read RefSeq transcripts into a python dict/model.
7401        log.debug(f"Transcripts loading...")
7402        with tempfile.TemporaryDirectory() as tmpdir:
7403            transcripts_query = f"""
7404                COPY (
7405                    SELECT {refseq_table}.*
7406                    FROM {refseq_table}
7407                    JOIN df_variants ON (
7408                        {refseq_table}.chrom=df_variants.CHROM
7409                        AND {refseq_table}.txStart<=df_variants.POS
7410                        AND {refseq_table}.txEnd>=df_variants.POS
7411                    )
7412                )
7413                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
7414            """
7415            self.conn.query(transcripts_query)
7416            with open(f"{tmpdir}/transcript.tsv") as infile:
7417                transcripts = read_transcripts(infile)
7418
7419        # Polars connexion
7420        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7421
7422        log.debug("Genome loading...")
7423        # Read genome sequence using pyfaidx.
7424        genome = Fasta(genome_file)
7425
7426        log.debug("Start annotation HGVS...")
7427
7428        # Create
7429        # a Dask Dataframe from Pandas dataframe with partition as number of threads
7430        ddf = dd.from_pandas(df_variants, npartitions=threads)
7431
7432        # Use dask.dataframe.apply() to apply function on each partition
7433        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
7434
7435        # Convert Dask DataFrame to Pandas Dataframe
7436        df = ddf.compute()
7437
7438        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
7439        with tempfile.TemporaryDirectory() as tmpdir:
7440            df_parquet = os.path.join(tmpdir, "df.parquet")
7441            df.to_parquet(df_parquet)
7442
7443            # Update hgvs column
7444            update_variant_query = f"""
7445                UPDATE {table_variants}
7446                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
7447                FROM read_parquet('{df_parquet}') as df
7448                WHERE variants."#CHROM" = df.CHROM
7449                AND variants.POS = df.POS
7450                AND variants.REF = df.REF
7451                AND variants.ALT = df.ALT
7452                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
7453                """
7454            self.execute_query(update_variant_query)
7455
7456        # Update INFO column
7457        sql_query_update = f"""
7458            UPDATE {table_variants}
7459            SET INFO = 
7460                concat(
7461                    CASE 
7462                        WHEN INFO NOT IN ('','.')
7463                        THEN concat(INFO, ';')
7464                        ELSE ''
7465                    END,
7466                    'hgvs=',
7467                    {hgvs_column_name}
7468                )
7469            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
7470            """
7471        self.execute_query(sql_query_update)
7472
7473        # Add header
7474        HGVS_INFOS = {
7475            "hgvs": {
7476                "ID": "hgvs",
7477                "Number": ".",
7478                "Type": "String",
7479                "Description": f"HGVS annotatation with HOWARD",
7480            }
7481        }
7482
7483        for field in HGVS_INFOS:
7484            field_ID = HGVS_INFOS[field]["ID"]
7485            field_description = HGVS_INFOS[field]["Description"]
7486            self.get_header().infos[field_ID] = vcf.parser._Info(
7487                field_ID,
7488                HGVS_INFOS[field]["Number"],
7489                HGVS_INFOS[field]["Type"],
7490                field_description,
7491                "unknown",
7492                "unknown",
7493                code_type_map[HGVS_INFOS[field]["Type"]],
7494            )
7495
7496        # Remove added columns
7497        for added_column in added_columns:
7498            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7504    def get_operations_help(
7505        self, operations_config_dict: dict = {}, operations_config_file: str = None
7506    ) -> list:
7507
7508        # Init
7509        operations_help = []
7510
7511        # operations
7512        operations = self.get_config_json(
7513            name="calculations",
7514            config_dict=operations_config_dict,
7515            config_file=operations_config_file,
7516        )
7517        for op in operations:
7518            op_name = operations[op].get("name", op).upper()
7519            op_description = operations[op].get("description", op_name)
7520            op_available = operations[op].get("available", False)
7521            if op_available:
7522                operations_help.append(f"   {op_name}: {op_description}")
7523
7524        # Sort operations
7525        operations_help.sort()
7526
7527        # insert header
7528        operations_help.insert(0, "Available calculation operations:")
7529
7530        # Return
7531        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7533    def calculation(
7534        self,
7535        operations: dict = {},
7536        operations_config_dict: dict = {},
7537        operations_config_file: str = None,
7538    ) -> None:
7539        """
7540        It takes a list of operations, and for each operation, it checks if it's a python or sql
7541        operation, and then calls the appropriate function
7542
7543        param json example:
7544            "calculation": {
7545                "NOMEN": {
7546                    "options": {
7547                        "hgvs_field": "hgvs"
7548                    },
7549                "middle" : null
7550            }
7551        """
7552
7553        # Param
7554        param = self.get_param()
7555
7556        # operations config
7557        operations_config = self.get_config_json(
7558            name="calculations",
7559            config_dict=operations_config_dict,
7560            config_file=operations_config_file,
7561        )
7562
7563        # Upper keys
7564        operations_config = {k.upper(): v for k, v in operations_config.items()}
7565
7566        # Calculations
7567
7568        # Operations from param
7569        operations = param.get("calculation", {}).get("calculations", operations)
7570
7571        # Quick calculation - add
7572        if param.get("calculations", None):
7573            calculations_list = [
7574                value for value in param.get("calculations", "").split(",")
7575            ]
7576            log.info(f"Quick Calculations:")
7577            for calculation_key in calculations_list:
7578                log.info(f"   {calculation_key}")
7579            for calculation_operation in calculations_list:
7580                if calculation_operation.upper() not in operations:
7581                    operations[calculation_operation.upper()] = {}
7582                    add_value_into_dict(
7583                        dict_tree=param,
7584                        sections=[
7585                            "calculation",
7586                            "calculations",
7587                            calculation_operation.upper(),
7588                        ],
7589                        value={},
7590                    )
7591
7592        # Operations for calculation
7593        if not operations:
7594            operations = param.get("calculation", {}).get("calculations", {})
7595
7596        if operations:
7597            log.info(f"Calculations...")
7598
7599        # For each operations
7600        for operation_name in operations:
7601            operation_name = operation_name.upper()
7602            if operation_name not in [""]:
7603                if operation_name in operations_config:
7604                    log.info(f"Calculation '{operation_name}'")
7605                    operation = operations_config[operation_name]
7606                    operation_type = operation.get("type", "sql")
7607                    if operation_type == "python":
7608                        self.calculation_process_function(
7609                            operation=operation, operation_name=operation_name
7610                        )
7611                    elif operation_type == "sql":
7612                        self.calculation_process_sql(
7613                            operation=operation, operation_name=operation_name
7614                        )
7615                    else:
7616                        log.error(
7617                            f"Operations config: Type '{operation_type}' NOT available"
7618                        )
7619                        raise ValueError(
7620                            f"Operations config: Type '{operation_type}' NOT available"
7621                        )
7622                else:
7623                    log.error(
7624                        f"Operations config: Calculation '{operation_name}' NOT available"
7625                    )
7626                    raise ValueError(
7627                        f"Operations config: Calculation '{operation_name}' NOT available"
7628                    )
7629
7630        # Explode INFOS fields into table fields
7631        if self.get_explode_infos():
7632            self.explode_infos(
7633                prefix=self.get_explode_infos_prefix(),
7634                fields=self.get_explode_infos_fields(),
7635                force=True,
7636            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7638    def calculation_process_sql(
7639        self, operation: dict, operation_name: str = "unknown"
7640    ) -> None:
7641        """
7642        The `calculation_process_sql` function takes in a mathematical operation as a string and
7643        performs the operation, updating the specified table with the result.
7644
7645        :param operation: The `operation` parameter is a dictionary that contains information about the
7646        mathematical operation to be performed. It includes the following keys:
7647        :type operation: dict
7648        :param operation_name: The `operation_name` parameter is a string that represents the name of
7649        the mathematical operation being performed. It is used for logging and error handling purposes,
7650        defaults to unknown
7651        :type operation_name: str (optional)
7652        """
7653
7654        # table variants
7655        table_variants = self.get_table_variants(clause="alter")
7656
7657        # Operation infos
7658        operation_name = operation.get("name", "unknown")
7659        log.debug(f"process sql {operation_name}")
7660        output_column_name = operation.get("output_column_name", operation_name)
7661        output_column_type = operation.get("output_column_type", "String")
7662        prefix = operation.get("explode_infos_prefix", "")
7663        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7664        output_column_description = operation.get(
7665            "output_column_description", f"{operation_name} operation"
7666        )
7667        operation_query = operation.get("operation_query", None)
7668        if isinstance(operation_query, list):
7669            operation_query = " ".join(operation_query)
7670        operation_info_fields = operation.get("info_fields", [])
7671        operation_info_fields_check = operation.get("info_fields_check", False)
7672        operation_info = operation.get("operation_info", True)
7673
7674        if operation_query:
7675
7676            # Info fields check
7677            operation_info_fields_check_result = True
7678            if operation_info_fields_check:
7679                header_infos = self.get_header().infos
7680                for info_field in operation_info_fields:
7681                    operation_info_fields_check_result = (
7682                        operation_info_fields_check_result
7683                        and info_field in header_infos
7684                    )
7685
7686            # If info fields available
7687            if operation_info_fields_check_result:
7688
7689                # Added_columns
7690                added_columns = []
7691
7692                # Create VCF header field
7693                vcf_reader = self.get_header()
7694                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7695                    output_column_name,
7696                    ".",
7697                    output_column_type,
7698                    output_column_description,
7699                    "howard calculation",
7700                    "0",
7701                    self.code_type_map.get(output_column_type),
7702                )
7703
7704                # Explode infos if needed
7705                log.debug(f"calculation_process_sql prefix {prefix}")
7706                added_columns += self.explode_infos(
7707                    prefix=prefix,
7708                    fields=[output_column_name] + operation_info_fields,
7709                    force=True,
7710                )
7711
7712                # Create column
7713                added_column = self.add_column(
7714                    table_name=table_variants,
7715                    column_name=prefix + output_column_name,
7716                    column_type=output_column_type_sql,
7717                    default_value="null",
7718                )
7719                added_columns.append(added_column)
7720
7721                # Operation calculation
7722                try:
7723
7724                    # Query to update calculation column
7725                    sql_update = f"""
7726                        UPDATE {table_variants}
7727                        SET "{prefix}{output_column_name}" = ({operation_query})
7728                    """
7729                    self.conn.execute(sql_update)
7730
7731                    # Add to INFO
7732                    if operation_info:
7733                        sql_update_info = f"""
7734                            UPDATE {table_variants}
7735                            SET "INFO" =
7736                                concat(
7737                                    CASE
7738                                        WHEN "INFO" IS NOT NULL
7739                                        THEN concat("INFO", ';')
7740                                        ELSE ''
7741                                    END,
7742                                    '{output_column_name}=',
7743                                    "{prefix}{output_column_name}"
7744                                )
7745                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7746                        """
7747                        self.conn.execute(sql_update_info)
7748
7749                except:
7750                    log.error(
7751                        f"Operations config: Calculation '{operation_name}' query failed"
7752                    )
7753                    raise ValueError(
7754                        f"Operations config: Calculation '{operation_name}' query failed"
7755                    )
7756
7757                # Remove added columns
7758                for added_column in added_columns:
7759                    log.debug(f"added_column: {added_column}")
7760                    self.drop_column(column=added_column)
7761
7762            else:
7763                log.error(
7764                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7765                )
7766                raise ValueError(
7767                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7768                )
7769
7770        else:
7771            log.error(
7772                f"Operations config: Calculation '{operation_name}' query NOT defined"
7773            )
7774            raise ValueError(
7775                f"Operations config: Calculation '{operation_name}' query NOT defined"
7776            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
7778    def calculation_process_function(
7779        self, operation: dict, operation_name: str = "unknown"
7780    ) -> None:
7781        """
7782        The `calculation_process_function` takes in an operation dictionary and performs the specified
7783        function with the given parameters.
7784
7785        :param operation: The `operation` parameter is a dictionary that contains information about the
7786        operation to be performed. It has the following keys:
7787        :type operation: dict
7788        :param operation_name: The `operation_name` parameter is a string that represents the name of
7789        the operation being performed. It is used for logging purposes, defaults to unknown
7790        :type operation_name: str (optional)
7791        """
7792
7793        operation_name = operation["name"]
7794        log.debug(f"process sql {operation_name}")
7795        function_name = operation["function_name"]
7796        function_params = operation["function_params"]
7797        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
7799    def calculation_variant_id(self) -> None:
7800        """
7801        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7802        updates the INFO field of a variants table with the variant ID.
7803        """
7804
7805        # variant_id annotation field
7806        variant_id_tag = self.get_variant_id_column()
7807        added_columns = [variant_id_tag]
7808
7809        # variant_id hgvs tags"
7810        vcf_infos_tags = {
7811            variant_id_tag: "howard variant ID annotation",
7812        }
7813
7814        # Variants table
7815        table_variants = self.get_table_variants()
7816
7817        # Header
7818        vcf_reader = self.get_header()
7819
7820        # Add variant_id to header
7821        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7822            variant_id_tag,
7823            ".",
7824            "String",
7825            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7826            "howard calculation",
7827            "0",
7828            self.code_type_map.get("String"),
7829        )
7830
7831        # Update
7832        sql_update = f"""
7833            UPDATE {table_variants}
7834            SET "INFO" = 
7835                concat(
7836                    CASE
7837                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7838                        THEN ''
7839                        ELSE concat("INFO", ';')
7840                    END,
7841                    '{variant_id_tag}=',
7842                    "{variant_id_tag}"
7843                )
7844        """
7845        self.conn.execute(sql_update)
7846
7847        # Remove added columns
7848        for added_column in added_columns:
7849            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
7851    def calculation_extract_snpeff_hgvs(
7852        self,
7853        snpeff_hgvs: str = "snpeff_hgvs",
7854        snpeff_field: str = "ANN",
7855    ) -> None:
7856        """
7857        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7858        annotation field in a VCF file and adds them as a new column in the variants table.
7859
7860        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
7861        function is used to specify the name of the column that will store the HGVS nomenclatures
7862        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
7863        snpeff_hgvs
7864        :type snpeff_hgvs: str (optional)
7865        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
7866        function represents the field in the VCF file that contains SnpEff annotations. This field is
7867        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
7868        to ANN
7869        :type snpeff_field: str (optional)
7870        """
7871
7872        # Snpeff hgvs tags
7873        vcf_infos_tags = {
7874            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7875        }
7876
7877        # Prefix
7878        prefix = self.get_explode_infos_prefix()
7879        if prefix:
7880            prefix = "INFO/"
7881
7882        # snpEff fields
7883        speff_ann_infos = prefix + snpeff_field
7884        speff_hgvs_infos = prefix + snpeff_hgvs
7885
7886        # Variants table
7887        table_variants = self.get_table_variants()
7888
7889        # Header
7890        vcf_reader = self.get_header()
7891
7892        # Add columns
7893        added_columns = []
7894
7895        # Explode HGVS field in column
7896        added_columns += self.explode_infos(fields=[snpeff_field])
7897
7898        if snpeff_field in vcf_reader.infos:
7899
7900            log.debug(vcf_reader.infos[snpeff_field])
7901
7902            # Extract ANN header
7903            ann_description = vcf_reader.infos[snpeff_field].desc
7904            pattern = r"'(.+?)'"
7905            match = re.search(pattern, ann_description)
7906            if match:
7907                ann_header_match = match.group(1).split(" | ")
7908                ann_header_desc = {}
7909                for i in range(len(ann_header_match)):
7910                    ann_header_info = "".join(
7911                        char for char in ann_header_match[i] if char.isalnum()
7912                    )
7913                    ann_header_desc[ann_header_info] = ann_header_match[i]
7914                if not ann_header_desc:
7915                    raise ValueError("Invalid header description format")
7916            else:
7917                raise ValueError("Invalid header description format")
7918
7919            # Create variant id
7920            variant_id_column = self.get_variant_id_column()
7921            added_columns += [variant_id_column]
7922
7923            # Create dataframe
7924            dataframe_snpeff_hgvs = self.get_query_to_df(
7925                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7926            )
7927
7928            # Create main NOMEN column
7929            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7930                speff_ann_infos
7931            ].apply(
7932                lambda x: extract_snpeff_hgvs(
7933                    str(x), header=list(ann_header_desc.values())
7934                )
7935            )
7936
7937            # Add snpeff_hgvs to header
7938            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7939                snpeff_hgvs,
7940                ".",
7941                "String",
7942                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7943                "howard calculation",
7944                "0",
7945                self.code_type_map.get("String"),
7946            )
7947
7948            # Update
7949            sql_update = f"""
7950                UPDATE variants
7951                SET "INFO" = 
7952                    concat(
7953                        CASE
7954                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7955                            THEN ''
7956                            ELSE concat("INFO", ';')
7957                        END,
7958                        CASE 
7959                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7960                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7961                            THEN concat(
7962                                    '{snpeff_hgvs}=',
7963                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7964                                )
7965                            ELSE ''
7966                        END
7967                    )
7968                FROM dataframe_snpeff_hgvs
7969                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7970
7971            """
7972            self.conn.execute(sql_update)
7973
7974            # Delete dataframe
7975            del dataframe_snpeff_hgvs
7976            gc.collect()
7977
7978        else:
7979
7980            log.warning(
7981                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7982            )
7983
7984        # Remove added columns
7985        for added_column in added_columns:
7986            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. This parameter allows you to rename the output field; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a new column; defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
7988    def calculation_snpeff_ann_explode(
7989        self,
7990        uniquify: bool = True,
7991        output_format: str = "fields",
7992        output_prefix: str = "snpeff_",
7993        snpeff_field: str = "ANN",
7994    ) -> None:
7995        """
7996        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
7997        exploding the HGVS field and updating variant information accordingly.
7998
7999        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8000        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8001        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8002        defaults to True
8003        :type uniquify: bool (optional)
8004        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8005        function specifies the format in which the output annotations will be generated. It has a
8006        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8007        format, defaults to fields
8008        :type output_format: str (optional)
8009        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8010        method is used to specify the prefix that will be added to the output annotations generated
8011        during the calculation process. This prefix helps to differentiate the newly added annotations
8012        from existing ones in the output data. By default, the, defaults to ANN_
8013        :type output_prefix: str (optional)
8014        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8015        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8016        field will be processed to explode the HGVS annotations and update the variant information
8017        accordingly, defaults to ANN
8018        :type snpeff_field: str (optional)
8019        """
8020
8021        # SnpEff annotation field
8022        snpeff_hgvs = "snpeff_ann_explode"
8023
8024        # Snpeff hgvs tags
8025        vcf_infos_tags = {
8026            snpeff_hgvs: "Explode snpEff annotations",
8027        }
8028
8029        # Prefix
8030        prefix = self.get_explode_infos_prefix()
8031        if prefix:
8032            prefix = "INFO/"
8033
8034        # snpEff fields
8035        speff_ann_infos = prefix + snpeff_field
8036        speff_hgvs_infos = prefix + snpeff_hgvs
8037
8038        # Variants table
8039        table_variants = self.get_table_variants()
8040
8041        # Header
8042        vcf_reader = self.get_header()
8043
8044        # Add columns
8045        added_columns = []
8046
8047        # Explode HGVS field in column
8048        added_columns += self.explode_infos(fields=[snpeff_field])
8049        log.debug(f"snpeff_field={snpeff_field}")
8050        log.debug(f"added_columns={added_columns}")
8051
8052        if snpeff_field in vcf_reader.infos:
8053
8054            # Extract ANN header
8055            ann_description = vcf_reader.infos[snpeff_field].desc
8056            pattern = r"'(.+?)'"
8057            match = re.search(pattern, ann_description)
8058            if match:
8059                ann_header_match = match.group(1).split(" | ")
8060                ann_header = []
8061                ann_header_desc = {}
8062                for i in range(len(ann_header_match)):
8063                    ann_header_info = "".join(
8064                        char for char in ann_header_match[i] if char.isalnum()
8065                    )
8066                    ann_header.append(ann_header_info)
8067                    ann_header_desc[ann_header_info] = ann_header_match[i]
8068                if not ann_header_desc:
8069                    raise ValueError("Invalid header description format")
8070            else:
8071                raise ValueError("Invalid header description format")
8072
8073            # Create variant id
8074            variant_id_column = self.get_variant_id_column()
8075            added_columns += [variant_id_column]
8076
8077            # Create dataframe
8078            dataframe_snpeff_hgvs = self.get_query_to_df(
8079                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8080            )
8081
8082            # Create snpEff columns
8083            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8084                speff_ann_infos
8085            ].apply(
8086                lambda x: explode_snpeff_ann(
8087                    str(x),
8088                    uniquify=uniquify,
8089                    output_format=output_format,
8090                    prefix=output_prefix,
8091                    header=list(ann_header_desc.values()),
8092                )
8093            )
8094
8095            # Header
8096            ann_annotations_prefix = ""
8097            if output_format.upper() in ["JSON"]:
8098                ann_annotations_prefix = f"{output_prefix}="
8099                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8100                    output_prefix,
8101                    ".",
8102                    "String",
8103                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8104                    + " - JSON format",
8105                    "howard calculation",
8106                    "0",
8107                    self.code_type_map.get("String"),
8108                )
8109            else:
8110                for ann_annotation in ann_header:
8111                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8112                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8113                        ann_annotation_id,
8114                        ".",
8115                        "String",
8116                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8117                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8118                        "howard calculation",
8119                        "0",
8120                        self.code_type_map.get("String"),
8121                    )
8122
8123            # Update
8124            sql_update = f"""
8125                UPDATE variants
8126                SET "INFO" = 
8127                    concat(
8128                        CASE
8129                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8130                            THEN ''
8131                            ELSE concat("INFO", ';')
8132                        END,
8133                        CASE 
8134                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8135                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8136                            THEN concat(
8137                                '{ann_annotations_prefix}',
8138                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8139                                )
8140                            ELSE ''
8141                        END
8142                    )
8143                FROM dataframe_snpeff_hgvs
8144                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8145
8146            """
8147            self.conn.execute(sql_update)
8148
8149            # Delete dataframe
8150            del dataframe_snpeff_hgvs
8151            gc.collect()
8152
8153        else:
8154
8155            log.warning(
8156                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8157            )
8158
8159        # Remove added columns
8160        for added_column in added_columns:
8161            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
    def calculation_extract_nomen(self) -> None:
        """
        Extract and identify the NOMEN nomenclature fields from an HGVS annotation field.

        The HGVS field (configured in param under
        ``calculation.calculations.NOMEN.options.hgvs_field``, default ``hgvs``) is
        exploded into a column, parsed by ``find_nomen`` (optionally constrained by a
        transcripts-of-preference file), and each resulting NOMEN sub-field
        (NOMEN, CNOMEN, RNOMEN, ...) is appended to the INFO column of the variants
        table and declared in the VCF header.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the full NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field name from param (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts-of-preference file from param (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the ordered transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe (joined back to the variants table by #CHROM/POS/REF/ALT)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Parse the HGVS string of each variant into a NOMEN dict
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each NOMEN sub-field into its own dataframe column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the sub-field as an INFO field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # NOTE(review): relies on DuckDB's postfix "expr NOT NULL" shorthand
                # for "expr IS NOT NULL" — confirm against the DuckDB version in use.
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN sub-fields to INFO.
            # NOTE(review): each sub-field is prefixed with ';', so when INFO is
            # empty ('' or NULL) the result starts with a ';' — other calculation
            # methods guard against this; verify whether a leading ';' is intended.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function parses the HGVS nomenclature field of each variant to compute and identify the NOMEN annotation fields.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8306    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8307        """
8308        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8309        pipeline/sample for a variant and updates the variant information in a VCF file.
8310
8311        :param tag: The `tag` parameter is a string that represents the annotation field for the
8312        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8313        VCF header and to update the corresponding field in the variants table, defaults to
8314        findbypipeline
8315        :type tag: str (optional)
8316        """
8317
8318        # if FORMAT and samples
8319        if (
8320            "FORMAT" in self.get_header_columns_as_list()
8321            and self.get_header_sample_list()
8322        ):
8323
8324            # findbypipeline annotation field
8325            findbypipeline_tag = tag
8326
8327            # VCF infos tags
8328            vcf_infos_tags = {
8329                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8330            }
8331
8332            # Prefix
8333            prefix = self.get_explode_infos_prefix()
8334
8335            # Field
8336            findbypipeline_infos = prefix + findbypipeline_tag
8337
8338            # Variants table
8339            table_variants = self.get_table_variants()
8340
8341            # Header
8342            vcf_reader = self.get_header()
8343
8344            # Create variant id
8345            variant_id_column = self.get_variant_id_column()
8346            added_columns = [variant_id_column]
8347
8348            # variant_id, FORMAT and samples
8349            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8350                self.get_header_sample_list()
8351            )
8352
8353            # Create dataframe
8354            dataframe_findbypipeline = self.get_query_to_df(
8355                f""" SELECT {samples_fields} FROM {table_variants} """
8356            )
8357
8358            # Create findbypipeline column
8359            dataframe_findbypipeline[findbypipeline_infos] = (
8360                dataframe_findbypipeline.apply(
8361                    lambda row: findbypipeline(
8362                        row, samples=self.get_header_sample_list()
8363                    ),
8364                    axis=1,
8365                )
8366            )
8367
8368            # Add snpeff_hgvs to header
8369            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8370                findbypipeline_tag,
8371                ".",
8372                "String",
8373                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8374                "howard calculation",
8375                "0",
8376                self.code_type_map.get("String"),
8377            )
8378
8379            # Update
8380            sql_update = f"""
8381                UPDATE variants
8382                SET "INFO" = 
8383                    concat(
8384                        CASE
8385                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8386                            THEN ''
8387                            ELSE concat("INFO", ';')
8388                        END,
8389                        CASE 
8390                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8391                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8392                            THEN concat(
8393                                    '{findbypipeline_tag}=',
8394                                    dataframe_findbypipeline."{findbypipeline_infos}"
8395                                )
8396                            ELSE ''
8397                        END
8398                    )
8399                FROM dataframe_findbypipeline
8400                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8401            """
8402            self.conn.execute(sql_update)
8403
8404            # Remove added columns
8405            for added_column in added_columns:
8406                self.drop_column(column=added_column)
8407
8408            # Delete dataframe
8409            del dataframe_findbypipeline
8410            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8412    def calculation_genotype_concordance(self) -> None:
8413        """
8414        The function `calculation_genotype_concordance` calculates the genotype concordance for
8415        multi-caller VCF files and updates the variant information in the database.
8416        """
8417
8418        # if FORMAT and samples
8419        if (
8420            "FORMAT" in self.get_header_columns_as_list()
8421            and self.get_header_sample_list()
8422        ):
8423
8424            # genotypeconcordance annotation field
8425            genotypeconcordance_tag = "genotypeconcordance"
8426
8427            # VCF infos tags
8428            vcf_infos_tags = {
8429                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8430            }
8431
8432            # Prefix
8433            prefix = self.get_explode_infos_prefix()
8434
8435            # Field
8436            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8437
8438            # Variants table
8439            table_variants = self.get_table_variants()
8440
8441            # Header
8442            vcf_reader = self.get_header()
8443
8444            # Create variant id
8445            variant_id_column = self.get_variant_id_column()
8446            added_columns = [variant_id_column]
8447
8448            # variant_id, FORMAT and samples
8449            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8450                self.get_header_sample_list()
8451            )
8452
8453            # Create dataframe
8454            dataframe_genotypeconcordance = self.get_query_to_df(
8455                f""" SELECT {samples_fields} FROM {table_variants} """
8456            )
8457
8458            # Create genotypeconcordance column
8459            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8460                dataframe_genotypeconcordance.apply(
8461                    lambda row: genotypeconcordance(
8462                        row, samples=self.get_header_sample_list()
8463                    ),
8464                    axis=1,
8465                )
8466            )
8467
8468            # Add genotypeconcordance to header
8469            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8470                genotypeconcordance_tag,
8471                ".",
8472                "String",
8473                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8474                "howard calculation",
8475                "0",
8476                self.code_type_map.get("String"),
8477            )
8478
8479            # Update
8480            sql_update = f"""
8481                UPDATE variants
8482                SET "INFO" = 
8483                    concat(
8484                        CASE
8485                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8486                            THEN ''
8487                            ELSE concat("INFO", ';')
8488                        END,
8489                        CASE
8490                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8491                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8492                            THEN concat(
8493                                    '{genotypeconcordance_tag}=',
8494                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8495                                )
8496                            ELSE ''
8497                        END
8498                    )
8499                FROM dataframe_genotypeconcordance
8500                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8501            """
8502            self.conn.execute(sql_update)
8503
8504            # Remove added columns
8505            for added_column in added_columns:
8506                self.drop_column(column=added_column)
8507
8508            # Delete dataframe
8509            del dataframe_genotypeconcordance
8510            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8512    def calculation_barcode(self, tag: str = "barcode") -> None:
8513        """
8514        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8515        updates the INFO field in the file with the calculated barcode values.
8516
8517        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8518        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8519        the default tag name is set to "barcode", defaults to barcode
8520        :type tag: str (optional)
8521        """
8522
8523        # if FORMAT and samples
8524        if (
8525            "FORMAT" in self.get_header_columns_as_list()
8526            and self.get_header_sample_list()
8527        ):
8528
8529            # barcode annotation field
8530            if not tag:
8531                tag = "barcode"
8532
8533            # VCF infos tags
8534            vcf_infos_tags = {
8535                tag: "barcode calculation (VaRank)",
8536            }
8537
8538            # Prefix
8539            prefix = self.get_explode_infos_prefix()
8540
8541            # Field
8542            barcode_infos = prefix + tag
8543
8544            # Variants table
8545            table_variants = self.get_table_variants()
8546
8547            # Header
8548            vcf_reader = self.get_header()
8549
8550            # Create variant id
8551            variant_id_column = self.get_variant_id_column()
8552            added_columns = [variant_id_column]
8553
8554            # variant_id, FORMAT and samples
8555            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8556                self.get_header_sample_list()
8557            )
8558
8559            # Create dataframe
8560            dataframe_barcode = self.get_query_to_df(
8561                f""" SELECT {samples_fields} FROM {table_variants} """
8562            )
8563
8564            # Create barcode column
8565            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8566                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8567            )
8568
8569            # Add barcode to header
8570            vcf_reader.infos[tag] = vcf.parser._Info(
8571                tag,
8572                ".",
8573                "String",
8574                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8575                "howard calculation",
8576                "0",
8577                self.code_type_map.get("String"),
8578            )
8579
8580            # Update
8581            sql_update = f"""
8582                UPDATE {table_variants}
8583                SET "INFO" = 
8584                    concat(
8585                        CASE
8586                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8587                            THEN ''
8588                            ELSE concat("INFO", ';')
8589                        END,
8590                        CASE
8591                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8592                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8593                            THEN concat(
8594                                    '{tag}=',
8595                                    dataframe_barcode."{barcode_infos}"
8596                                )
8597                            ELSE ''
8598                        END
8599                    )
8600                FROM dataframe_barcode
8601                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8602            """
8603            self.conn.execute(sql_update)
8604
8605            # Remove added columns
8606            for added_column in added_columns:
8607                self.drop_column(column=added_column)
8608
8609            # Delete dataframe
8610            del dataframe_barcode
8611            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for
        variants in a VCF file from the genotypes of a set of pedigree samples.

        The barcode is computed by the `barcode` helper on the pedigree samples,
        then appended to the FORMAT column and to every sample genotype as two new
        FORMAT fields: `{tag}` (the barcode) and `{tag}S` (the comma-separated
        family samples). Note: only FORMAT/sample columns are updated by the SQL
        below; the INFO field is not modified.

        The pedigree (`calculation.calculations.BARCODEFAMILY.family_pedigree` in
        param) may be a JSON file path, a JSON string, a comma-separated sample
        list, or a dict; when absent, all samples of the VCF are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fallback if an empty tag is given)
            if not tag:
                tag = "BCF"

            # VCF infos tags (descriptions for the two FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            # NOTE: 'ped' is successively rebound: config value -> file handle
            # -> dict; after this block it is a dict {member: sample}
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as a comma-separated sample list
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of pedigree sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Exploded column name holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added for the join, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (computed on the pedigree samples only)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Register the barcode family FORMAT field in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            # Register the barcode family samples FORMAT field in the header
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET expression per column: FORMAT gets ':{tag}:{tag}S'
            # appended, pedigree samples get the barcode and the sample list,
            # other samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, expand to './.' plus one '.' per extra
                # FORMAT sub-field (built by stripping value characters from
                # FORMAT and turning each ':' into ':.') — assumes FORMAT keys
                # match [a-zA-Z0-9\s]; TODO confirm for non-alphanumeric keys
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET expressions in a single UPDATE joined on variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio pedigree (`calculation.calculations.TRIO.trio_pedigree` in param)
        may be a JSON file path, a JSON string, a comma-separated
        "father,mother,child" list, or a dict with keys father/mother/child; when
        absent, the first three samples of the VCF are used. The `trio` helper
        computes the value appended to INFO as a 'trio' tag.

        :raises ValueError: if the trio pedigree is not well formatted or fewer
        than three samples are available
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            # NOTE: 'trio_ped' is successively rebound: config value -> file
            # handle -> dict; after this block it is {father, mother, child}
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect "father,mother,child"
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio sample list in father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: assume the first 3 samples are
                # father, mother, child (in that order)
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree (exactly father, mother and child expected)
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Exploded column name holding the computed trio value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added for the join, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (computed on the trio samples only)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is a copy-paste leftover; it is dead code since the key exists
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'trio=value' to INFO (skipping empty/'.' values), joining
            # on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_trio
            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
8982    def calculation_vaf_normalization(self) -> None:
8983        """
8984        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8985        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8986        :return: The function does not return anything.
8987        """
8988
8989        # if FORMAT and samples
8990        if (
8991            "FORMAT" in self.get_header_columns_as_list()
8992            and self.get_header_sample_list()
8993        ):
8994
8995            # vaf_normalization annotation field
8996            vaf_normalization_tag = "VAF"
8997
8998            # VCF infos tags
8999            vcf_infos_tags = {
9000                "VAF": "VAF Variant Frequency",
9001            }
9002
9003            # Prefix
9004            prefix = self.get_explode_infos_prefix()
9005
9006            # Variants table
9007            table_variants = self.get_table_variants()
9008
9009            # Header
9010            vcf_reader = self.get_header()
9011
9012            # Do not calculate if VAF already exists
9013            if "VAF" in vcf_reader.formats:
9014                log.debug("VAF already on genotypes")
9015                return
9016
9017            # Create variant id
9018            variant_id_column = self.get_variant_id_column()
9019            added_columns = [variant_id_column]
9020
9021            # variant_id, FORMAT and samples
9022            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9023                f""" "{sample}" """ for sample in self.get_header_sample_list()
9024            )
9025
9026            # Create dataframe
9027            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9028            log.debug(f"query={query}")
9029            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9030
9031            vaf_normalization_set = []
9032
9033            # for each sample vaf_normalization
9034            for sample in self.get_header_sample_list():
9035                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9036                    lambda row: vaf_normalization(row, sample=sample), axis=1
9037                )
9038                vaf_normalization_set.append(
9039                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9040                )
9041
9042            # Add VAF to FORMAT
9043            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9044                "FORMAT"
9045            ].apply(lambda x: str(x) + ":VAF")
9046            vaf_normalization_set.append(
9047                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9048            )
9049
9050            # Add vaf_normalization to header
9051            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9052                id=vaf_normalization_tag,
9053                num="1",
9054                type="Float",
9055                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9056                type_code=self.code_type_map.get("Float"),
9057            )
9058
9059            # Create fields to add in INFO
9060            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9061
9062            # Update
9063            sql_update = f"""
9064                UPDATE {table_variants}
9065                SET {sql_vaf_normalization_set}
9066                FROM dataframe_vaf_normalization
9067                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9068
9069            """
9070            self.conn.execute(sql_update)
9071
9072            # Remove added columns
9073            for added_column in added_columns:
9074                self.drop_column(column=added_column)
9075
9076            # Delete dataframe
9077            del dataframe_vaf_normalization
9078            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
9080    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9081        """
9082        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9083        field in a VCF file and updates the INFO column of the variants table with the calculated
9084        statistics.
9085
9086        :param info: The `info` parameter is a string that represents the type of information for which
9087        genotype statistics are calculated. It is used to generate various VCF info tags for the
9088        statistics, such as the number of occurrences, the list of values, the minimum value, the
9089        maximum value, the mean, the median, defaults to VAF
9090        :type info: str (optional)
9091        """
9092
9093        # if FORMAT and samples
9094        if (
9095            "FORMAT" in self.get_header_columns_as_list()
9096            and self.get_header_sample_list()
9097        ):
9098
9099            # vaf_stats annotation field
9100            vaf_stats_tag = info + "_stats"
9101
9102            # VCF infos tags
9103            vcf_infos_tags = {
9104                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9105                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9106                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9107                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9108                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9109                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9110                info
9111                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9112            }
9113
9114            # Prefix
9115            prefix = self.get_explode_infos_prefix()
9116
9117            # Field
9118            vaf_stats_infos = prefix + vaf_stats_tag
9119
9120            # Variants table
9121            table_variants = self.get_table_variants()
9122
9123            # Header
9124            vcf_reader = self.get_header()
9125
9126            # Create variant id
9127            variant_id_column = self.get_variant_id_column()
9128            added_columns = [variant_id_column]
9129
9130            # variant_id, FORMAT and samples
9131            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9132                self.get_header_sample_list()
9133            )
9134
9135            # Create dataframe
9136            dataframe_vaf_stats = self.get_query_to_df(
9137                f""" SELECT {samples_fields} FROM {table_variants} """
9138            )
9139
9140            # Create vaf_stats column
9141            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9142                lambda row: genotype_stats(
9143                    row, samples=self.get_header_sample_list(), info=info
9144                ),
9145                axis=1,
9146            )
9147
9148            # List of vcf tags
9149            sql_vaf_stats_fields = []
9150
9151            # Check all VAF stats infos
9152            for stat in vcf_infos_tags:
9153
9154                # Extract stats
9155                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9156                    lambda x: dict(x).get(stat, "")
9157                )
9158
9159                # Add snpeff_hgvs to header
9160                vcf_reader.infos[stat] = vcf.parser._Info(
9161                    stat,
9162                    ".",
9163                    "String",
9164                    vcf_infos_tags.get(stat, "genotype statistics"),
9165                    "howard calculation",
9166                    "0",
9167                    self.code_type_map.get("String"),
9168                )
9169
9170                if len(sql_vaf_stats_fields):
9171                    sep = ";"
9172                else:
9173                    sep = ""
9174
9175                # Create fields to add in INFO
9176                sql_vaf_stats_fields.append(
9177                    f"""
9178                        CASE
9179                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9180                            THEN concat(
9181                                    '{sep}{stat}=',
9182                                    dataframe_vaf_stats."{stat}"
9183                                )
9184                            ELSE ''
9185                        END
9186                    """
9187                )
9188
9189            # SQL set for update
9190            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9191
9192            # Update
9193            sql_update = f"""
9194                UPDATE {table_variants}
9195                SET "INFO" = 
9196                    concat(
9197                        CASE
9198                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9199                            THEN ''
9200                            ELSE concat("INFO", ';')
9201                        END,
9202                        {sql_vaf_stats_fields_set}
9203                    )
9204                FROM dataframe_vaf_stats
9205                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9206
9207            """
9208            self.conn.execute(sql_update)
9209
9210            # Remove added columns
9211            for added_column in added_columns:
9212                self.drop_column(column=added_column)
9213
9214            # Delete dataframe
9215            del dataframe_vaf_stats
9216            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, and the standard deviation. Defaults to VAF.
def calculation_transcripts_json(self, info: str = 'transcripts_json') -> None:
9218    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
9219        """
9220        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
9221        to it if transcripts are available.
9222
9223        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
9224        parameter that specifies the information field to be used in the transcripts JSON. It has a
9225        default value of "transcripts_json" if no value is provided when calling the method, defaults to
9226        transcripts_json
9227        :type info: str (optional)
9228        """
9229
9230        # Create transcripts table
9231        transcripts_table = self.create_transcript_view()
9232
9233        # Add info field
9234        if transcripts_table:
9235            self.transcript_view_to_variants(
9236                transcripts_table=transcripts_table, transcripts_info_field=info
9237            )
9238        else:
9239            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_json creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info: The info parameter in the calculation_transcripts_json method is a string parameter that specifies the information field to be used in the transcripts JSON. It has a default value of "transcripts_json" if no value is provided when calling the method, defaults to transcripts_json
def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9245    def create_transcript_view_from_columns_map(
9246        self,
9247        transcripts_table: str = "transcripts",
9248        columns_maps: dict = {},
9249        added_columns: list = [],
9250        temporary_tables: list = None,
9251        annotation_fields: list = None,
9252    ) -> tuple[list, list, list]:
9253        """
9254        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9255        specified columns mapping for transcripts data.
9256
9257        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9258        the table where the transcripts data is stored or will be stored in the database. This table
9259        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9260        predictions, etc. It defaults to "transcripts, defaults to transcripts
9261        :type transcripts_table: str (optional)
9262        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9263        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9264        represents a mapping configuration for a specific set of columns. It typically includes details such
9265        as the main transcript column and additional information columns
9266        :type columns_maps: dict
9267        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9268        function is a list that stores the additional columns that will be added to the view being created
9269        based on the columns map provided. These columns are generated by exploding the transcript
9270        information columns along with the main transcript column
9271        :type added_columns: list
9272        :param temporary_tables: The `temporary_tables` parameter in the
9273        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9274        tables created during the process of creating a transcript view from a columns map. These temporary
9275        tables are used to store intermediate results or transformations before the final view is generated
9276        :type temporary_tables: list
9277        :param annotation_fields: The `annotation_fields` parameter in the
9278        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9279        for annotation in the query view creation process. These fields are extracted from the
9280        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9281        :type annotation_fields: list
9282        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9283        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9284        """
9285
9286        log.debug("Start transcrpts view creation from columns map...")
9287
9288        # "from_columns_map": [
9289        #     {
9290        #         "transcripts_column": "Ensembl_transcriptid",
9291        #         "transcripts_infos_columns": [
9292        #             "genename",
9293        #             "Ensembl_geneid",
9294        #             "LIST_S2_score",
9295        #             "LIST_S2_pred",
9296        #         ],
9297        #     },
9298        #     {
9299        #         "transcripts_column": "Ensembl_transcriptid",
9300        #         "transcripts_infos_columns": [
9301        #             "genename",
9302        #             "VARITY_R_score",
9303        #             "Aloft_pred",
9304        #         ],
9305        #     },
9306        # ],
9307
9308        # Init
9309        if temporary_tables is None:
9310            temporary_tables = []
9311        if annotation_fields is None:
9312            annotation_fields = []
9313
9314        # Variants table
9315        table_variants = self.get_table_variants()
9316
9317        for columns_map in columns_maps:
9318
9319            # Transcript column
9320            transcripts_column = columns_map.get("transcripts_column", None)
9321
9322            # Transcripts infos columns
9323            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9324
9325            if transcripts_column is not None:
9326
9327                # Explode
9328                added_columns += self.explode_infos(
9329                    fields=[transcripts_column] + transcripts_infos_columns
9330                )
9331
9332                # View clauses
9333                clause_select = []
9334                for field in [transcripts_column] + transcripts_infos_columns:
9335                    clause_select.append(
9336                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9337                    )
9338                    if field not in [transcripts_column]:
9339                        annotation_fields.append(field)
9340
9341                # Querey View
9342                query = f""" 
9343                    SELECT
9344                        "#CHROM", POS, REF, ALT,
9345                        "{transcripts_column}" AS 'transcript',
9346                        {", ".join(clause_select)}
9347                    FROM (
9348                        SELECT 
9349                            "#CHROM", POS, REF, ALT,
9350                            {", ".join(clause_select)}
9351                        FROM {table_variants}
9352                        )
9353                    WHERE "{transcripts_column}" IS NOT NULL
9354                """
9355
9356                # Create temporary table
9357                temporary_table = transcripts_table + "".join(
9358                    random.choices(string.ascii_uppercase + string.digits, k=10)
9359                )
9360
9361                # Temporary_tables
9362                temporary_tables.append(temporary_table)
9363                query_view = f"""
9364                    CREATE TEMPORARY TABLE {temporary_table}
9365                    AS ({query})
9366                """
9367                self.execute_query(query=query_view)
9368
9369        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns_maps entries.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9371    def create_transcript_view_from_column_format(
9372        self,
9373        transcripts_table: str = "transcripts",
9374        column_formats: dict = {},
9375        temporary_tables: list = None,
9376        annotation_fields: list = None,
9377    ) -> tuple[list, list, list]:
9378        """
9379        The `create_transcript_view_from_column_format` function generates a transcript view based on
9380        specified column formats, adds additional columns and annotation fields, and returns the list of
9381        temporary tables and annotation fields.
9382
9383        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9384        the table containing the transcripts data. This table will be used as the base table for creating
9385        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9386        different table name if needed, defaults to transcripts
9387        :type transcripts_table: str (optional)
9388        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9389        about the columns to be used for creating the transcript view. Each entry in the dictionary
9390        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9391        the provided code snippet:
9392        :type column_formats: dict
9393        :param temporary_tables: The `temporary_tables` parameter in the
9394        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9395        views created during the process of creating a transcript view from a column format. These temporary
9396        views are used to manipulate and extract data before generating the final transcript view. It
9397        :type temporary_tables: list
9398        :param annotation_fields: The `annotation_fields` parameter in the
9399        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9400        that are extracted from the temporary views created during the process. These annotation fields are
9401        obtained by querying the temporary views and extracting the column names excluding specific columns
9402        like `#CH
9403        :type annotation_fields: list
9404        :return: The `create_transcript_view_from_column_format` function returns two lists:
9405        `temporary_tables` and `annotation_fields`.
9406        """
9407
9408        log.debug("Start transcrpts view creation from column format...")
9409
9410        #  "from_column_format": [
9411        #     {
9412        #         "transcripts_column": "ANN",
9413        #         "transcripts_infos_column": "Feature_ID",
9414        #     }
9415        # ],
9416
9417        # Init
9418        if temporary_tables is None:
9419            temporary_tables = []
9420        if annotation_fields is None:
9421            annotation_fields = []
9422
9423        for column_format in column_formats:
9424
9425            # annotation field and transcript annotation field
9426            annotation_field = column_format.get("transcripts_column", "ANN")
9427            transcript_annotation = column_format.get(
9428                "transcripts_infos_column", "Feature_ID"
9429            )
9430
9431            # Temporary View name
9432            temporary_view_name = transcripts_table + "".join(
9433                random.choices(string.ascii_uppercase + string.digits, k=10)
9434            )
9435
9436            # Create temporary view name
9437            temporary_view_name = self.annotation_format_to_table(
9438                uniquify=True,
9439                annotation_field=annotation_field,
9440                view_name=temporary_view_name,
9441                annotation_id=transcript_annotation,
9442            )
9443
9444            # Annotation fields
9445            if temporary_view_name:
9446                query_annotation_fields = f"""
9447                    SELECT *
9448                    FROM (
9449                        DESCRIBE SELECT *
9450                        FROM {temporary_view_name}
9451                        )
9452                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9453                """
9454                df_annotation_fields = self.get_query_to_df(
9455                    query=query_annotation_fields
9456                )
9457
9458                # Add temporary view and annotation fields
9459                temporary_tables.append(temporary_view_name)
9460                annotation_fields += list(set(df_annotation_fields["column_name"]))
9461
9462        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet:
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CHROM`, `POS`, `REF`, and `ALT`.
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
9464    def create_transcript_view(
9465        self,
9466        transcripts_table: str = None,
9467        transcripts_table_drop: bool = True,
9468        param: dict = {},
9469    ) -> str:
9470        """
9471        The `create_transcript_view` function generates a transcript view by processing data from a
9472        specified table based on provided parameters and structural information.
9473
9474        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9475        is used to specify the name of the table that will store the final transcript view data. If a table
9476        name is not provided, the function will create a new table to store the transcript view data, and by
9477        default,, defaults to transcripts
9478        :type transcripts_table: str (optional)
9479        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9480        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9481        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9482        the function will drop the existing transcripts table if it exists, defaults to True
9483        :type transcripts_table_drop: bool (optional)
9484        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9485        contains information needed to create a transcript view. It includes details such as the structure
9486        of the transcripts, columns mapping, column formats, and other necessary information for generating
9487        the view. This parameter allows for flexibility and customization
9488        :type param: dict
9489        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9490        created or modified during the execution of the function.
9491        """
9492
9493        log.debug("Start transcrpts view creation...")
9494
9495        # Default
9496        transcripts_table_default = "transcripts"
9497
9498        # Param
9499        if not param:
9500            param = self.get_param()
9501
9502        # Struct
9503        struct = param.get("transcripts", {}).get("struct", None)
9504
9505        if struct:
9506
9507            # Transcripts table
9508            if transcripts_table is None:
9509                transcripts_table = param.get("transcripts", {}).get(
9510                    "table", transcripts_table_default
9511                )
9512
9513            # added_columns
9514            added_columns = []
9515
9516            # Temporary tables
9517            temporary_tables = []
9518
9519            # Annotation fields
9520            annotation_fields = []
9521
9522            # from columns map
9523            columns_maps = struct.get("from_columns_map", [])
9524            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9525                self.create_transcript_view_from_columns_map(
9526                    transcripts_table=transcripts_table,
9527                    columns_maps=columns_maps,
9528                    added_columns=added_columns,
9529                    temporary_tables=temporary_tables,
9530                    annotation_fields=annotation_fields,
9531                )
9532            )
9533            added_columns += added_columns_tmp
9534            temporary_tables += temporary_tables_tmp
9535            annotation_fields += annotation_fields_tmp
9536
9537            # from column format
9538            column_formats = struct.get("from_column_format", [])
9539            temporary_tables_tmp, annotation_fields_tmp = (
9540                self.create_transcript_view_from_column_format(
9541                    transcripts_table=transcripts_table,
9542                    column_formats=column_formats,
9543                    temporary_tables=temporary_tables,
9544                    annotation_fields=annotation_fields,
9545                )
9546            )
9547            temporary_tables += temporary_tables_tmp
9548            annotation_fields += annotation_fields_tmp
9549
9550            # Merge temporary tables query
9551            query_merge = ""
9552            for temporary_table in temporary_tables:
9553
9554                # First temporary table
9555                if not query_merge:
9556                    query_merge = f"""
9557                        SELECT * FROM {temporary_table}
9558                    """
9559                # other temporary table (using UNION)
9560                else:
9561                    query_merge += f"""
9562                        UNION BY NAME SELECT * FROM {temporary_table}
9563                    """
9564
9565            # Merge on transcript
9566            query_merge_on_transcripts_annotation_fields = []
9567            # Aggregate all annotations fields
9568            for annotation_field in set(annotation_fields):
9569                query_merge_on_transcripts_annotation_fields.append(
9570                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9571                )
9572            # Query for transcripts view
9573            query_merge_on_transcripts = f"""
9574                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9575                FROM ({query_merge})
9576                GROUP BY "#CHROM", POS, REF, ALT, transcript
9577            """
9578
9579            # Drop transcript view is necessary
9580            if transcripts_table_drop:
9581                query_drop = f"""
9582                    DROP TABLE IF EXISTS {transcripts_table};
9583                """
9584                self.execute_query(query=query_drop)
9585
9586            # Merge and create transcript view
9587            query_create_view = f"""
9588                CREATE TABLE IF NOT EXISTS {transcripts_table}
9589                AS {query_merge_on_transcripts}
9590            """
9591            self.execute_query(query=query_create_view)
9592
9593            # Remove added columns
9594            for added_column in added_columns:
9595                self.drop_column(column=added_column)
9596
9597        else:
9598
9599            transcripts_table = None
9600
9601        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, named "transcripts" by default.
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
9603    def annotation_format_to_table(
9604        self,
9605        uniquify: bool = True,
9606        annotation_field: str = "ANN",
9607        annotation_id: str = "Feature_ID",
9608        view_name: str = "transcripts",
9609    ) -> str:
9610        """
9611        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
9612        table format.
9613
9614        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
9615        values in the output or not. If set to `True`, the function will make sure that the output values
9616        are unique, defaults to True
9617        :type uniquify: bool (optional)
9618        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
9619        contains the annotation information for each variant. This field is used to extract the annotation
9620        details for further processing in the function, defaults to ANN
9621        :type annotation_field: str (optional)
9622        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
9623        used to specify the identifier for the annotation feature. This identifier will be used as a column
9624        name in the resulting table or view that is created based on the annotation data. It helps in
9625        uniquely identifying each annotation entry in the, defaults to Feature_ID
9626        :type annotation_id: str (optional)
9627        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
9628        specify the name of the temporary table that will be created to store the transformed annotation
9629        data. This table will hold the extracted information from the annotation field in a structured
9630        format for further processing or analysis, defaults to transcripts
9631        :type view_name: str (optional)
9632        :return: The function `annotation_format_to_table` is returning the name of the view created, which
9633        is stored in the variable `view_name`.
9634        """
9635
9636        # Annotation field
9637        annotation_format = "annotation_explode"
9638
9639        # Transcript annotation
9640        annotation_id = "".join(char for char in annotation_id if char.isalnum())
9641
9642        # Prefix
9643        prefix = self.get_explode_infos_prefix()
9644        if prefix:
9645            prefix = "INFO/"
9646
9647        # Annotation fields
9648        annotation_infos = prefix + annotation_field
9649        annotation_format_infos = prefix + annotation_format
9650
9651        # Variants table
9652        table_variants = self.get_table_variants()
9653
9654        # Header
9655        vcf_reader = self.get_header()
9656
9657        # Add columns
9658        added_columns = []
9659
9660        # Explode HGVS field in column
9661        added_columns += self.explode_infos(fields=[annotation_field])
9662
9663        if annotation_field in vcf_reader.infos:
9664
9665            # Extract ANN header
9666            ann_description = vcf_reader.infos[annotation_field].desc
9667            pattern = r"'(.+?)'"
9668            match = re.search(pattern, ann_description)
9669            if match:
9670                ann_header_match = match.group(1).split(" | ")
9671                ann_header = []
9672                ann_header_desc = {}
9673                for i in range(len(ann_header_match)):
9674                    ann_header_info = "".join(
9675                        char for char in ann_header_match[i] if char.isalnum()
9676                    )
9677                    ann_header.append(ann_header_info)
9678                    ann_header_desc[ann_header_info] = ann_header_match[i]
9679                if not ann_header_desc:
9680                    raise ValueError("Invalid header description format")
9681            else:
9682                raise ValueError("Invalid header description format")
9683
9684            # Create variant id
9685            variant_id_column = self.get_variant_id_column()
9686            added_columns += [variant_id_column]
9687
9688            # Create dataframe
9689            dataframe_annotation_format = self.get_query_to_df(
9690                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
9691            )
9692
9693            # Create annotation columns
9694            dataframe_annotation_format[
9695                annotation_format_infos
9696            ] = dataframe_annotation_format[annotation_infos].apply(
9697                lambda x: explode_annotation_format(
9698                    annotation=str(x),
9699                    uniquify=uniquify,
9700                    output_format="JSON",
9701                    prefix="",
9702                    header=list(ann_header_desc.values()),
9703                )
9704            )
9705
9706            # Find keys
9707            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
9708            df_keys = self.get_query_to_df(query=query_json)
9709
9710            # Check keys
9711            query_json_key = []
9712            for _, row in df_keys.iterrows():
9713
9714                # Key
9715                key = row.iloc[0]
9716
9717                # key_clean
9718                key_clean = "".join(char for char in key if char.isalnum())
9719
9720                # Type
9721                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
9722
9723                # Get DataFrame from query
9724                df_json_type = self.get_query_to_df(query=query_json_type)
9725
9726                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
9727                with pd.option_context("future.no_silent_downcasting", True):
9728                    df_json_type.fillna(value="", inplace=True)
9729                    replace_dict = {None: np.nan, "": np.nan}
9730                    df_json_type.replace(replace_dict, inplace=True)
9731                    df_json_type.dropna(inplace=True)
9732
9733                # Detect column type
9734                column_type = detect_column_type(df_json_type[key_clean])
9735
9736                # Append
9737                query_json_key.append(
9738                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
9739                )
9740
9741            # Create view
9742            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
9743            self.execute_query(query=query_view)
9744
9745        else:
9746
9747            # Return None
9748            view_name = None
9749
9750        # Remove added columns
9751        for added_column in added_columns:
9752            self.drop_column(column=added_column)
9753
9754        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field: str = None, param: dict = {}) -> bool:
9756    def transcript_view_to_variants(
9757        self,
9758        transcripts_table: str = None,
9759        transcripts_column_id: str = None,
9760        transcripts_info_json: str = None,
9761        transcripts_info_field: str = None,
9762        param: dict = {},
9763    ) -> bool:
9764        """
9765        The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
9766        a variants table with information from the transcripts in JSON format.
9767
9768        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table
9769        containing the transcripts data. If this parameter is not provided, the function will attempt to
9770        retrieve it from the `param` dictionary or use a default value of "transcripts"
9771        :type transcripts_table: str
9772        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in
9773        the `transcripts_table` that contains the unique identifier for each transcript. This identifier is
9774        used to match transcripts with variants in the database
9775        :type transcripts_column_id: str
9776        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of
9777        the column in the variants table where the transcripts information will be stored in JSON format
9778        :type transcripts_info_json: str
9779        :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field
9780        in the VCF header that will contain information about transcripts in JSON format. This field will be
9781        added to the VCF header as an INFO field with the specified name
9782        :type transcripts_info_field: str
9783        :param param: The `transcript_view_to_variants` method takes several parameters:
9784        :type param: dict
9785        :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the
9786        operation is successful and `False` if certain conditions are not met.
9787        """
9788
9789        log.debug("Start transcripts view to JSON...")
9790
9791        # Default
9792        transcripts_table_default = "transcripts"
9793        transcripts_column_id_default = "transcript"
9794        transcripts_info_json_default = None
9795        transcripts_info_field_default = None
9796
9797        # Param
9798        if not param:
9799            param = self.get_param()
9800
9801        # Transcripts table
9802        if transcripts_table is None:
9803            transcripts_table = param.get("transcripts", {}).get(
9804                "table", transcripts_table_default
9805            )
9806
9807        # Transcripts column ID
9808        if transcripts_column_id is None:
9809            transcripts_column_id = param.get("transcripts", {}).get(
9810                "column_id", transcripts_column_id_default
9811            )
9812
9813        # Transcripts info field
9814        if transcripts_info_json is None:
9815            transcripts_info_json = param.get("transcripts", {}).get(
9816                "transcripts_info_json", transcripts_info_json_default
9817            )
9818
9819        # Transcripts info field
9820        if transcripts_info_field is None:
9821            transcripts_info_field = param.get("transcripts", {}).get(
9822                "transcripts_info_field", transcripts_info_field_default
9823            )
9824
9825        # Variants table
9826        table_variants = self.get_table_variants()
9827
9828        # Check info columns param
9829        if transcripts_info_json is None and transcripts_info_field is None:
9830            return False
9831
9832        # Transcripts infos columns
9833        query_transcripts_infos_columns = f"""
9834            SELECT *
9835            FROM (
9836                DESCRIBE SELECT * FROM {transcripts_table}
9837                )
9838            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
9839        """
9840        transcripts_infos_columns = list(
9841            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
9842        )
9843
9844        # View results
9845        clause_select = []
9846        clause_to_json = []
9847        for field in transcripts_infos_columns:
9848            clause_select.append(
9849                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9850            )
9851            clause_to_json.append(f""" '{field}': "{field}" """)
9852
9853        # Update
9854        update_set = []
9855
9856        # VCF header
9857        vcf_reader = self.get_header()
9858
9859        # Transcripts to info column in JSON
9860        if transcripts_info_json is not None:
9861
9862            # Create column on variants table
9863            self.add_column(
9864                table_name=table_variants,
9865                column_name=transcripts_info_json,
9866                column_type="JSON",
9867                default_value=None,
9868                drop=False,
9869            )
9870
9871            # Add to update
9872            update_set.append(
9873                f""" {transcripts_info_json}=t.{transcripts_info_json} """
9874            )
9875
9876            # Add header
9877            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
9878                transcripts_info_json,
9879                ".",
9880                "String",
9881                "Transcripts in JSON format",
9882                "unknwon",
9883                "unknwon",
9884                self.code_type_map["String"],
9885            )
9886
9887        # Transcripts to info field in JSON
9888        if transcripts_info_field is not None:
9889
9890            # Add to update
9891            update_set.append(
9892                f""" 
9893                    INFO = concat(
9894                            CASE
9895                                WHEN INFO NOT IN ('', '.')
9896                                THEN INFO
9897                                ELSE ''
9898                            END,
9899                            CASE
9900                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
9901                                THEN concat(
9902                                    ';{transcripts_info_field}=',
9903                                    t.{transcripts_info_json}
9904                                )
9905                                ELSE ''
9906                            END
9907                            )
9908                """
9909            )
9910
9911            # Add header
9912            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
9913                transcripts_info_field,
9914                ".",
9915                "String",
9916                "Transcripts in JSON format",
9917                "unknwon",
9918                "unknwon",
9919                self.code_type_map["String"],
9920            )
9921
9922        # Update query
9923        query_update = f"""
9924            UPDATE {table_variants}
9925                SET {", ".join(update_set)}
9926            FROM
9927            (
9928                SELECT
9929                    "#CHROM", POS, REF, ALT,
9930                        concat(
9931                        '{{',
9932                        string_agg(
9933                            '"' || "{transcripts_column_id}" || '":' ||
9934                            to_json(json_output)
9935                        ),
9936                        '}}'
9937                        )::JSON AS {transcripts_info_json}
9938                FROM
9939                    (
9940                    SELECT
9941                        "#CHROM", POS, REF, ALT,
9942                        "{transcripts_column_id}",
9943                        to_json(
9944                            {{{",".join(clause_to_json)}}}
9945                        )::JSON AS json_output
9946                    FROM
9947                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
9948                    WHERE "{transcripts_column_id}" IS NOT NULL
9949                    )
9950                GROUP BY "#CHROM", POS, REF, ALT
9951            ) AS t
9952            WHERE {table_variants}."#CHROM" = t."#CHROM"
9953                AND {table_variants}."POS" = t."POS"
9954                AND {table_variants}."REF" = t."REF"
9955                AND {table_variants}."ALT" = t."ALT"
9956        """
9957
9958        self.execute_query(query=query_update)
9959
9960        return True

The function transcript_view_to_variants takes input parameters related to transcripts and updates a variants table with information from the transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format
  • transcripts_info_field: The transcripts_info_field parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter is a dictionary of settings used as a fallback source for the other arguments when they are not provided; it defaults to the object's current parameters
Returns

The function transcript_view_to_variants returns a boolean value, which is True if the operation is successful and False if certain conditions are not met.